def __init__(self, src, dst):
    """Build a parallel corpus from a source file and a destination file.

    Reads both files line-by-line and pairs them via ``self._make_sequences``
    (presumably source/target sentence pairs -- see that helper), storing the
    result in ``self.sequences``. Progress is reported on stdout.
    """
    sys.stdout.write("Reading files '" + src + "' and '" + dst + "'")
    sys.stdout.flush()
    # Fix: the original called fopen(...).readlines() and never closed either
    # handle; context managers guarantee release even on error.
    with fopen(src, "r") as f:
        source_lines = f.readlines()
    with fopen(dst, "r") as f:
        destination_lines = f.readlines()
    self.sequences = self._make_sequences(source_lines, destination_lines)
    sys.stdout.write(" found " + str(len(self.sequences)) + " pairs\n")
def save(self, filename):
    """Serialize all vocabulary tables to ``filename``.

    Format: one ``HEADER count`` line per section (LABELS, CHARACTERS, WORDS,
    UPOS, XPOS, ATTRS) followed by ``key<TAB>index`` lines, matching what
    ``load`` expects to read back.
    """
    # Fix: the original only called f.close() on the success path, leaking
    # the handle (and possibly buffered data) if any write raised.
    with fopen(filename, "w") as f:
        f.write("LABELS " + str(len(self.label2int)) + "\n")
        for label in self.label2int:
            f.write(str(label) + "\t" + str(self.label2int[label]) + "\n")
        f.write("CHARACTERS " + str(len(self.char2int)) + "\n")
        for character in self.char2int:
            # Python 2 keeps unicode keys that must be UTF-8 encoded on write.
            if sys.version_info[0] == 2:
                f.write(
                    character.encode('utf-8') + "\t" +
                    str(self.char2int[character]) + "\n")
            else:
                f.write(character + "\t" + str(self.char2int[character]) + "\n")
        f.write("WORDS " + str(len(self.word2int)) + "\n")
        for word in self.word2int:
            if sys.version_info[0] == 2:
                f.write(
                    word.encode('utf-8') + "\t" +
                    str(self.word2int[word]) + "\n")
            else:
                f.write(word + "\t" + str(self.word2int[word]) + "\n")
        f.write("UPOS " + str(len(self.upos2int)) + "\n")
        for label in self.upos2int:
            f.write(label + "\t" + str(self.upos2int[label]) + "\n")
        f.write("XPOS " + str(len(self.xpos2int)) + "\n")
        for label in self.xpos2int:
            f.write(label + "\t" + str(self.xpos2int[label]) + "\n")
        f.write("ATTRS " + str(len(self.attrs2int)) + "\n")
        for label in self.attrs2int:
            f.write(label + "\t" + str(self.attrs2int[label]) + "\n")
def write(self, filename):
    """Dump every sequence to ``filename``, one entry per line.

    Each entry is rendered with ``str(...)`` followed by a newline, and a
    blank line separates consecutive sequences (CoNLL-style layout).
    """
    with fopen(filename, 'w') as out:
        for seq in self.sequences:
            for entry in seq:
                out.write(str(entry) + "\n")
            out.write("\n")
def __init__(self, file=None):
    """Optionally populate ``self.sequences`` from ``file``.

    When ``file`` is None the constructor does nothing (sequences are
    presumably filled in later -- e.g. via load_language; TODO confirm).
    """
    if file is None:
        return
    sys.stdout.write("Reading " + file + "... ")
    sys.stdout.flush()
    with fopen(file, "r") as handle:
        raw_lines = handle.readlines()
    self.sequences = self._make_sequences(raw_lines)
    sys.stdout.write("found " + str(len(self.sequences)) + " sequences\n")
def read_from_file(self, word_embeddings_file, word_list, full_load=False):
    """Load (or index) word embeddings from a text embeddings file.

    The file is assumed to be word2vec/GloVe-style text: a header line,
    then ``word v1 v2 ... vn`` rows (TODO confirm header assumption against
    the files actually shipped). Three modes:

    * ``full_load=True``  -- load every vector into ``self.word2vec``.
    * ``word_list`` given -- load only vectors for words in ``word_list``.
    * neither             -- cache-only mode: record each word's byte offset
      in ``self.word2ofs`` and keep ``self.file_pointer`` open for lazy reads.
    """
    self.word2vec = {}
    self.num_embeddings = 0
    if word_list is None and not full_load:
        self.cache_only = True
    # Fix: the original leaked the handle if anything raised mid-scan.
    with fopen(word_embeddings_file, "r") as f:
        first_line = True
        while True:
            ofs = f.tell()  # captured before readline: offset of this row
            line = f.readline()
            if line == '':
                break
            line = line.replace("\n", "").replace("\r", "")
            if first_line:
                # Skip the "count dimension" header row.
                first_line = False
                continue
            self.num_embeddings += 1
            if self.verbose and self.num_embeddings % 10000 == 0:
                sys.stdout.write(
                    " Scanned " + str(self.num_embeddings) +
                    " word embeddings and added " +
                    str(len(self.word2vec)) + " \n")
            parts = line.split(" ")
            if sys.version_info[0] == 2:
                word = parts[0].decode('utf-8')
            else:
                word = parts[0]
            if self.cache_only:
                self.word2ofs[word] = ofs
            elif full_load or word in word_list:
                # parts[1:-1]: drop the word and the trailing empty field
                # produced by the row's trailing space (len(parts) - 2 values).
                self.word2vec[word] = [float(p) for p in parts[1:-1]]
                self.word_embeddings_size = len(parts) - 2
    if self.cache_only:
        # Kept open deliberately for subsequent offset-based lookups.
        self.file_pointer = fopen(word_embeddings_file, "r")
def load_language(self, file, lang_id, ignore_compound=False):
    """Append sequences for one language, tagged with ``lang_id``."""
    sys.stdout.write("Reading " + file + "... ")
    sys.stdout.flush()
    with fopen(file, "r") as handle:
        raw_lines = handle.readlines()
    parsed = self._make_sequences(
        raw_lines, lang_id=lang_id, ignore_compound=ignore_compound)
    for seq, l_id in parsed:
        self.sequences.append([seq, l_id])
    sys.stdout.write("found " + str(len(parsed)) + " sequences\n")
def save(self, filename):
    """Write model metadata as pretty-printed, key-sorted JSON.

    ``filename`` must end in ``metadata.json`` (same convention ``read``
    checks).
    """
    assert (filename.endswith("metadata.json"))
    obj = {
        "language": self.language,
        "language_code": self.language_code,
        "model_version": self.model_version,
        "embeddings_remote_link": self.embeddings_remote_link,
        "embeddings_file_name": self.embeddings_file_name,
        "token_delimiter": self.token_delimiter,
        "model_build_date": self.model_build_date,
        "model_build_source": self.model_build_source,
        "notes": self.notes,
    }
    # Fix: json.dump(obj, fopen(...)) never closed the file, so the last
    # buffered bytes could be lost until interpreter exit.
    with fopen(filename, "w") as f:
        json.dump(obj, f, indent=4, sort_keys=True)
def read(self, filename):
    """Load metadata JSON into this object's attributes.

    Every key/value pair in the file becomes an instance attribute;
    ``model_version`` is coerced to float when stored as a string so
    version comparisons stay numeric.
    """
    assert (filename.endswith("metadata.json"))
    # Fix: json.load(fopen(...)) leaked the handle; also matches the
    # context-manager style of the validating read() elsewhere in this file.
    with fopen(filename, "r") as f:
        data = json.load(f)
    if sys.version_info[0] == 2:
        items = data.iteritems()
    else:
        items = data.items()
    for key, value in items:
        # Safety check to keep the version as float; all other keys are
        # stored verbatim so no metadata entry is silently dropped.
        if key == "model_version" and isinstance(value, str):
            self.__dict__[key] = float(value)
        else:
            self.__dict__[key] = value
def load_dict(self, path):
    """Fill ``self.word2lemma`` from a 5-column tab-separated dictionary.

    Each valid row maps ``lowercased_word<TAB>upos`` to its lemma
    (column 5). Rows with a different column count are skipped.
    """
    with fopen(path, "r") as f:
        all_rows = f.readlines()
    for row in all_rows:
        parts = row.strip().split('\t')
        if len(parts) != 5:
            continue
        if sys.version_info[0] == 2:
            # Python 2: lowercase in unicode space, store back as UTF-8.
            word = unicode(parts[0], 'utf-8').lower().encode('utf-8')
        else:
            word = parts[0].lower()
        self.word2lemma[word + '\t' + parts[1]] = parts[4]
def read(self, filename):
    """Load and validate metadata JSON into this object's attributes.

    Raises:
        Exception: if the file is missing or its name does not end in
            ``metadata.json``.
    """
    if not os.path.exists(filename):
        raise Exception("Metadata file [" + filename + "] not found!")
    if not filename.endswith("metadata.json"):
        raise Exception(
            "Metadata file [" + filename + "] does not seem to be valid!")
    with fopen(filename, "r") as f:
        data = json.load(f)
    if sys.version_info[0] == 2:
        items = data.iteritems()
    else:
        items = data.items()
    for key, value in items:
        # Safety check to keep the version as float; the flattened condition
        # guarantees every key is assigned (nothing silently dropped).
        if key == "model_version" and isinstance(value, str):
            self.__dict__[key] = float(value)
        else:
            self.__dict__[key] = value
def _download_with_progress_bar(self, url, local_filename):
    """Stream ``url`` into ``local_filename``, drawing a 40-char progress bar.

    Requires the server to send a ``Content-Length`` header (a KeyError is
    raised otherwise, as before).
    """
    # Fix: the streamed Response was never closed, holding the connection
    # open; `with` releases it deterministically.
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers['Content-Length'].strip())
        current_size = 0
        with fopen(local_filename, 'wb') as f:
            for buf in r.iter_content(4096 * 16):
                if not buf:
                    continue  # skip keep-alive chunks
                f.write(buf)
                current_size += len(buf)
                done = int(40 * current_size / total_size)
                sys.stdout.write(
                    "\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." %
                    ('=' * done, ' ' * (40 - done),
                     100 * current_size / total_size,
                     current_size / 1024 / 1024,
                     total_size / 1024 / 1024))
                sys.stdout.flush()
def save(self, filename):
    """Write every vocabulary table as ``HEADER count`` + ``key<TAB>index``
    sections, in the fixed order LABELS, CHARACTERS, WORDS, UPOS, XPOS, ATTRS.
    """
    f = fopen(filename, "w")

    def dump_section(header, table, stringify=False):
        # One section: header with entry count, then key/index rows in the
        # table's iteration order. Only LABELS keys go through str().
        f.write(header + " " + str(len(table)) + "\n")
        for key in table:
            text = str(key) if stringify else key
            f.write(text + "\t" + str(table[key]) + "\n")

    dump_section("LABELS", self.label2int, stringify=True)
    dump_section("CHARACTERS", self.char2int)
    dump_section("WORDS", self.word2int)
    dump_section("UPOS", self.upos2int)
    dump_section("XPOS", self.xpos2int)
    dump_section("ATTRS", self.attrs2int)
    f.close()
def save(self, filename):
    """Save configuration to file.

    Public (non-underscore) attributes are written, key-sorted, under the
    ``self.__config__`` section. Floats that would print like ints get a
    trailing ".0" so they round-trip back as floats.
    """
    ordered = collections.OrderedDict(sorted(self.__dict__.items()))
    if sys.version_info[0] == 2:
        parser = ConfigParser.ConfigParser()
        pairs = ordered.iteritems()
    else:
        parser = configparser.ConfigParser()
        pairs = ordered.items()
    parser.add_section(self.__config__)  # write header
    for key, value in pairs:
        if key.startswith("_"):
            continue  # private properties are not persisted
        if isinstance(value, float):
            rendered = str(value)
            if "e" not in rendered and "." not in rendered:
                # stop possible confusion with an int by appending a ".0"
                value = rendered + ".0"
        parser.set(self.__config__, key, str(value))
    with fopen(filename, 'w') as cfgfile:
        parser.write(cfgfile)
def load(self, filename):
    """Restore vocabulary tables written by ``save``.

    We only read character2int, labels, holistic words and label2int here;
    word_list should be recomputed for every dataset (if deemed necessary).
    Sections are consumed in the fixed order LABELS, CHARACTERS, WORDS,
    UPOS, XPOS, ATTRS; all but WORDS also rebuild the index->key list.
    """
    with fopen(filename, "r") as f:
        self.labels = self._load_section(f, self.label2int, "labels")
        self.characters = self._load_section(f, self.char2int, "characters")
        self._load_section(f, self.word2int, "words", keep_list=False)
        self.upos_list = self._load_section(f, self.upos2int, "upos")
        self.xpos_list = self._load_section(f, self.xpos2int, "xpos")
        self.attrs_list = self._load_section(f, self.attrs2int, "attrs")

def _load_section(self, f, table, name, keep_list=True):
    """Read one ``HEADER count`` section from ``f`` into ``table``.

    Returns the inverse index->key list when ``keep_list`` is True,
    otherwise None. Replaces six copy-pasted loops from the original.
    """
    count = int(f.readline().split(" ")[1])
    if self.verbose:
        print("Loading " + name + " " + str(count))
    inverse = [""] * count if keep_list else None
    for _ in range(count):
        parts = f.readline().split("\t")
        key = parts[0]
        value = int(parts[1])  # int() tolerates the trailing newline
        table[key] = value
        if inverse is not None:
            inverse[value] = key
    return inverse