Esempio n. 1
0
File: mt.py Progetto: adobe/NLP-Cube
 def __init__(self, src, dst):
     """Read the source and destination files and build the sequence pairs.

     Both files are read completely; their lines are handed to
     ``self._make_sequences`` and the result is stored in
     ``self.sequences``. Progress is reported on stdout.

     :param src: path of the source-side file
     :param dst: path of the destination-side file
     """
     sys.stdout.write("Reading files '" + src + "' and '" + dst + "'")
     sys.stdout.flush()
     # Context managers make sure both handles are closed, even when reading
     # fails; the handles were previously never closed at all.
     with fopen(src, "r") as src_file:
         source_lines = src_file.readlines()
     with fopen(dst, "r") as dst_file:
         destination_lines = dst_file.readlines()
     self.sequences = self._make_sequences(source_lines, destination_lines)
     sys.stdout.write(" found " + str(len(self.sequences)) + " pairs\n")
Esempio n. 2
0
    def save(self, filename):
        """Write all lookup tables of this object to ``filename`` as text.

        Six sections are written in this order: LABELS, CHARACTERS, WORDS,
        UPOS, XPOS and ATTRS. Each section starts with a header line made of
        the section name, a space and the number of entries, followed by one
        line per entry holding the key, a tab and the integer index.

        :param filename: path of the output file
        """
        is_py2 = sys.version_info[0] == 2

        def encoded(text):
            # Under Python 2 characters and words are UTF-8 encoded before
            # being written; under Python 3 they are written unchanged.
            return text.encode('utf-8') if is_py2 else text

        # The context manager closes the file even when a write raises; the
        # former trailing f.close() was skipped in that case.
        with fopen(filename, "w") as f:
            f.write("LABELS " + str(len(self.label2int)) + "\n")
            for label in self.label2int:
                f.write(str(label) + "\t" + str(self.label2int[label]) + "\n")

            f.write("CHARACTERS " + str(len(self.char2int)) + "\n")
            for character in self.char2int:
                f.write(encoded(character) + "\t" +
                        str(self.char2int[character]) + "\n")

            f.write("WORDS " + str(len(self.word2int)) + "\n")
            for word in self.word2int:
                f.write(encoded(word) + "\t" + str(self.word2int[word]) + "\n")

            f.write("UPOS " + str(len(self.upos2int)) + "\n")
            for label in self.upos2int:
                f.write(label + "\t" + str(self.upos2int[label]) + "\n")

            f.write("XPOS " + str(len(self.xpos2int)) + "\n")
            for label in self.xpos2int:
                f.write(label + "\t" + str(self.xpos2int[label]) + "\n")

            f.write("ATTRS " + str(len(self.attrs2int)) + "\n")
            for label in self.attrs2int:
                f.write(label + "\t" + str(self.attrs2int[label]) + "\n")
Esempio n. 3
0
 def write(self, filename):
     """Dump ``self.sequences`` to ``filename``.

     Every entry of a sequence is written as ``str(entry)`` on its own
     line, and each sequence is terminated by one empty line.

     :param filename: path of the output file
     """
     with fopen(filename, 'w') as out:
         for seq in self.sequences:
             # One line per entry, then the blank separator line.
             out.writelines(str(item) + "\n" for item in seq)
             out.write("\n")
Esempio n. 4
0
 def __init__(self, file=None):
     """Optionally populate ``self.sequences`` from a file.

     When ``file`` is given, all of its lines are read and passed to
     ``self._make_sequences``; progress is printed to stdout. When
     ``file`` is None this block does nothing.

     :param file: path of the file to read, or None
     """
     if file is not None:
         sys.stdout.write("Reading " + file + "... ")
         sys.stdout.flush()
         with fopen(file, "r") as handle:
             raw_lines = handle.readlines()

         # The file is already closed here; parsing works on the in-memory lines.
         self.sequences = self._make_sequences(raw_lines)
         sequence_count = len(self.sequences)
         sys.stdout.write("found " + str(sequence_count) + " sequences\n")
Esempio n. 5
0
    def read_from_file(self, word_embeddings_file, word_list, full_load=False):
        """Scan a text embeddings file and load or index its vectors.

        The first line of the file is skipped (presumably a header - verify
        against the file format). Every other line is split on single
        spaces: the first field is the word, the following fields are the
        vector components. The last field is ignored, i.e. lines are assumed
        to end with a trailing space (NOTE(review): confirm for your files).

        Depending on the arguments, each word is handled in one of three ways:

        * cache-only mode (``self.cache_only`` is true; it is switched on
          here when ``word_list`` is None and ``full_load`` is false): only
          the file offset of the line is stored in ``self.word2ofs`` and the
          file is reopened as ``self.file_pointer`` at the end;
        * ``full_load`` is true: every vector is stored in ``self.word2vec``;
        * otherwise only words contained in ``word_list`` are stored.

        :param word_embeddings_file: path of the embeddings file
        :param word_list: container of words to load, or None
        :param full_load: load every vector regardless of ``word_list``
        """
        self.word2vec = {}
        self.num_embeddings = 0
        if word_list is None and not full_load:
            self.cache_only = True
        is_py2 = sys.version_info[0] == 2

        # The context manager closes the scan handle even if a line fails to
        # parse (e.g. a non-numeric component); it used to leak in that case.
        with fopen(word_embeddings_file, "r") as f:
            first_line = True
            # readline() is used instead of iterating the file so that
            # f.tell() stays usable for recording line offsets.
            while True:
                ofs = f.tell()
                line = f.readline()
                if line == '':
                    break
                if first_line:
                    first_line = False
                    continue
                line = line.replace("\n", "").replace("\r", "")
                self.num_embeddings += 1
                if self.verbose and self.num_embeddings % 10000 == 0:
                    sys.stdout.write("  Scanned " +
                                     str(self.num_embeddings) +
                                     " word embeddings and added " +
                                     str(len(self.word2vec)) + "  \n")
                parts = line.split(" ")
                word = parts[0].decode('utf-8') if is_py2 else parts[0]
                if self.cache_only:
                    self.word2ofs[word] = ofs
                elif full_load or word in word_list:
                    # Components are every field between the word and the
                    # (ignored) last field.
                    self.word2vec[word] = [float(v) for v in parts[1:-1]]
                self.word_embeddings_size = len(parts) - 2
        if self.cache_only:
            # Kept open on purpose: it is stored on the instance for later use.
            self.file_pointer = fopen(word_embeddings_file, "r")
Esempio n. 6
0
    def load_language(self, file, lang_id, ignore_compound=False):
        """Read ``file`` and append its sequences to ``self.sequences``.

        The lines of the file are passed to ``self._make_sequences`` together
        with ``lang_id`` and ``ignore_compound``. Every returned
        ``[sequence, language id]`` pair is appended to ``self.sequences`` as
        a new two-element list. Progress is printed to stdout.

        :param file: path of the file to read
        :param lang_id: forwarded to ``_make_sequences`` as ``lang_id``
        :param ignore_compound: forwarded to ``_make_sequences``
        """
        sys.stdout.write("Reading " + file + "... ")
        sys.stdout.flush()
        with fopen(file, "r") as handle:
            raw_lines = handle.readlines()

        new_sequences = self._make_sequences(
            raw_lines, lang_id=lang_id, ignore_compound=ignore_compound)
        for sequence, language_id in new_sequences:
            # Stored as a fresh list so the pair is not shared with the source.
            self.sequences.append([sequence, language_id])
        sys.stdout.write("found " + str(len(new_sequences)) + " sequences\n")
Esempio n. 7
0
 def save(self, filename):
     """Serialize the metadata fields of this object to a JSON file.

     The output is a JSON object with sorted keys and 4-space indentation
     containing: language, language_code, model_version,
     embeddings_remote_link, embeddings_file_name, token_delimiter,
     model_build_date, model_build_source and notes.

     :param filename: output path; must end with ``metadata.json``
     :raises AssertionError: if ``filename`` has a different ending
     """
     # NOTE(review): assert is stripped under ``python -O``; it is kept so
     # callers relying on AssertionError keep working.
     assert (filename.endswith("metadata.json"))
     obj = {
         "language": self.language,
         "language_code": self.language_code,
         "model_version": self.model_version,
         "embeddings_remote_link": self.embeddings_remote_link,
         "embeddings_file_name": self.embeddings_file_name,
         "token_delimiter": self.token_delimiter,
         "model_build_date": self.model_build_date,
         "model_build_source": self.model_build_source,
         "notes": self.notes,
     }
     # Close (and therefore flush) the file deterministically instead of
     # relying on garbage collection of an anonymous handle.
     with fopen(filename, "w") as f:
         json.dump(obj, f, indent=4, sort_keys=True)
Esempio n. 8
0
 def read(self, filename):
     """Load metadata from a JSON file into this object's attributes.

     Every key of the top-level JSON object becomes an instance attribute.
     ``model_version`` is always stored as a float.

     :param filename: input path; must end with ``metadata.json``
     :raises AssertionError: if ``filename`` has a different ending
     :raises ValueError: if ``model_version`` is a non-numeric string
     """
     assert (filename.endswith("metadata.json"))
     # Close the handle deterministically instead of leaving it to the GC.
     with fopen(filename, "r") as f:
         data = json.load(f)
     for key, value in data.items():
         if key == "model_version":
             if value is None:
                 # Nothing to convert; keep whatever value is already set.
                 continue
             # Convert numbers as well as strings. Previously only ``str``
             # values were stored, so a numeric version (what json.dump
             # writes for a float) was silently dropped.
             value = float(value)
         self.__dict__[key] = value
Esempio n. 9
0
 def load_dict(self, path):
     """Fill ``self.word2lemma`` from a tab-separated file.

     Only lines that have exactly five tab-separated fields (after
     stripping surrounding whitespace) are used. The dictionary key is the
     lower-cased first field, a tab, and the second field; the stored
     value is the fifth field.

     :param path: path of the file to read
     """
     running_py2 = sys.version_info[0] == 2
     with fopen(path, "r") as handle:
         for raw_line in handle.readlines():
             fields = raw_line.strip().split('\t')
             if len(fields) != 5:
                 continue
             if running_py2:
                 # Lower-case as unicode, then go back to UTF-8 bytes.
                 form = unicode(fields[0], 'utf-8').lower().encode('utf-8')
             else:
                 form = fields[0].lower()
             self.word2lemma[form + '\t' + fields[1]] = fields[4]
Esempio n. 10
0
 def read(self, filename):
     """Load metadata from a JSON file into this object's attributes.

     Every key of the top-level JSON object becomes an instance attribute.
     ``model_version`` is always stored as a float.

     :param filename: path of an existing file ending with ``metadata.json``
     :raises Exception: if the file does not exist or has a different ending
     :raises ValueError: if ``model_version`` is a non-numeric string
     """
     if not os.path.exists(filename):
         raise Exception("Metadata file [" + filename + "] not found!")
     if not filename.endswith("metadata.json"):
         raise Exception("Metadata file [" + filename +
                         "] does not seem to be valid!")
     with fopen(filename, "r") as f:
         data = json.load(f)
     for key, value in data.items():
         if key == "model_version":
             if value is None:
                 # Nothing to convert; keep whatever value is already set.
                 continue
             # Convert numbers as well as strings. Previously only ``str``
             # values were stored, so a numeric version (what json.dump
             # writes for a float) was silently dropped.
             value = float(value)
         self.__dict__[key] = value
Esempio n. 11
0
 def _download_with_progress_bar(self, url, local_filename):
     """Stream ``url`` into ``local_filename`` and report progress on stdout.

     The body is fetched in chunks of 64 KiB. When the response announces a
     positive ``Content-Length``, a 40-character progress bar with the
     percentage and the downloaded/total size in MB is redrawn after every
     chunk; otherwise only the downloaded size is shown.

     NOTE(review): the HTTP status is not checked, so an error page would
     be written to ``local_filename`` like any other body.

     :param url: address to download
     :param local_filename: path of the file to write (binary mode)
     """
     r = requests.get(url, stream=True)
     try:
         # Content-Length is optional (e.g. chunked transfer) and may be
         # malformed; treat both cases as "size unknown" instead of failing.
         try:
             total_size = int((r.headers.get('Content-Length') or '').strip())
         except ValueError:
             total_size = 0
         current_size = 0
         megabyte = 1024.0 * 1024.0  # float so Python 2 does not truncate
         with fopen(local_filename, 'wb') as f:
             for buf in r.iter_content(4096 * 16):
                 if not buf:
                     continue
                 f.write(buf)
                 current_size += len(buf)
                 if total_size > 0:
                     # Clamp the bar: a decoded body can be larger than the
                     # announced (encoded) length.
                     done = min(40, int(40 * current_size / float(total_size)))
                     sys.stdout.write(
                         "\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." %
                         ('=' * done, ' ' * (40 - done),
                          100 * current_size / float(total_size),
                          current_size / megabyte, total_size / megabyte))
                 else:
                     sys.stdout.write("\rdownloading %.2f MB ..." %
                                      (current_size / megabyte))
                 sys.stdout.flush()
     finally:
         # Release the connection even if writing the file fails.
         r.close()
Esempio n. 12
0
    def save(self, filename):
        """Write all lookup tables of this object to ``filename`` as text.

        Six sections are written in this order: LABELS, CHARACTERS, WORDS,
        UPOS, XPOS and ATTRS. Each section starts with a header line made of
        the section name, a space and the number of entries, followed by one
        line per entry holding the key, a tab and the integer index.

        :param filename: path of the output file
        """
        # The context manager closes the file even when a write raises; the
        # former trailing f.close() was skipped in that case.
        with fopen(filename, "w") as f:
            f.write("LABELS " + str(len(self.label2int)) + "\n")
            for label in self.label2int:
                # Labels are passed through str(); other keys are written as-is.
                f.write(str(label) + "\t" + str(self.label2int[label]) + "\n")

            f.write("CHARACTERS " + str(len(self.char2int)) + "\n")
            for character in self.char2int:
                f.write(character + "\t" + str(self.char2int[character]) + "\n")

            f.write("WORDS " + str(len(self.word2int)) + "\n")
            for word in self.word2int:
                f.write(word + "\t" + str(self.word2int[word]) + "\n")

            f.write("UPOS " + str(len(self.upos2int)) + "\n")
            for label in self.upos2int:
                f.write(label + "\t" + str(self.upos2int[label]) + "\n")

            f.write("XPOS " + str(len(self.xpos2int)) + "\n")
            for label in self.xpos2int:
                f.write(label + "\t" + str(self.xpos2int[label]) + "\n")

            f.write("ATTRS " + str(len(self.attrs2int)) + "\n")
            for label in self.attrs2int:
                f.write(label + "\t" + str(self.attrs2int[label]) + "\n")
Esempio n. 13
0
 def save(self, filename):
     """Save configuration to file.

     All instance attributes whose name does not start with an underscore
     are written, sorted by name, as string values into a single section
     named ``self.__config__``.

     :param filename: path of the configuration file to write
     """
     if sys.version_info[0] == 2:
         config = ConfigParser.ConfigParser()
     else:
         config = configparser.ConfigParser()
     config.add_section(self.__config__)  # write header
     for k, v in sorted(self.__dict__.items()):
         if k.startswith("_"):  # write only non-private properties
             continue
         if isinstance(v, float):
             str_v = str(v)
             # Append ".0" only when the text really looks like an int.
             # The old test ("no 'e' and no '.'") also matched "inf" and
             # "nan" and produced the unparseable "inf.0" / "nan.0".
             if str_v.lstrip("-").isdigit():
                 str_v += ".0"
             v = str_v
         config.set(self.__config__, k, str(v))
     with fopen(filename, 'w') as cfgfile:
         config.write(cfgfile)
Esempio n. 14
0
    def load(self, filename):
        """Restore the lookup tables from a text file.

        Six sections are read in this fixed order and stored as follows:

        1. labels     -> ``self.label2int`` and ``self.labels``
        2. characters -> ``self.char2int`` and ``self.characters``
        3. words      -> ``self.word2int`` (no index list is built)
        4. upos       -> ``self.upos2int`` and ``self.upos_list``
        5. xpos       -> ``self.xpos2int`` and ``self.xpos_list``
        6. attrs      -> ``self.attrs2int`` and ``self.attrs_list``

        Each section starts with a header line whose second space-separated
        field is the number of entries (the section name itself is not
        checked), followed by that many lines holding a key, a tab and an
        integer index. The ``*2int`` dictionaries are updated in place; the
        index lists are replaced.

        We only read character2int, labels, holistic words and label2int
        here. word_list should be recomputed for every dataset (if deemed
        necessary).

        :param filename: path of the file to read
        """

        def read_section(f, title, mapping, build_list=True):
            # Reads one section into ``mapping``; returns the index -> key
            # list, or None when ``build_list`` is false.
            count = int(f.readline().split(" ")[1])
            if self.verbose:
                print("Loading " + title + " " + str(count))
            keys_by_index = [""] * count if build_list else None
            for _ in range(count):
                # Split on the LAST tab so that a key which itself contains
                # a tab (e.g. the tab character) is read back intact.
                parts = f.readline().rsplit("\t", 1)
                key = parts[0]
                value = int(parts[1])
                mapping[key] = value
                if build_list:
                    keys_by_index[value] = key
            return keys_by_index

        # The context manager closes the file; no explicit close is needed.
        with fopen(filename, "r") as f:
            self.labels = read_section(f, "labels", self.label2int)
            self.characters = read_section(f, "characters", self.char2int)
            read_section(f, "words", self.word2int, build_list=False)
            # morphological attributes
            self.upos_list = read_section(f, "upos", self.upos2int)
            self.xpos_list = read_section(f, "xpos", self.xpos2int)
            self.attrs_list = read_section(f, "attrs", self.attrs2int)