def _process_sentence_tt(self, sentence, counter):
    """Process sentence with TreeTagger"""
    tokens_pos_tagged = []
    treetagger_tokens = self.treetagger.TagText(sentence)
    for token in treetagger_tokens:
        # TreeTagger is expected to emit "word\tPOS\tlemma" triples
        token_pos_tagged = token.split("\t")
        if len(token_pos_tagged) != 3:
            print >> sys.stderr, "Caution -- broken TreeTagger case:", \
                token_pos_tagged, "(list)"
            continue  # Skip it
        pos_tag = getTag(token_pos_tagged[1], self.lang)
        token = token_pos_tagged[2].lower()
        # skip unknown tokens, number placeholders, tokens without a
        # usable POS tag and overlong tokens
        if (token not in ["<unknown>", "@ord@", "@card@"]
                and pos_tag != NO_POS_SYM
                and len(token) <= max_word_len):
            token += "_" + pos_tag + "_" + self.lang
            tokens_pos_tagged.append(token)
    self.sentences[counter] = tokens_pos_tagged
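
# A minimal illustration of the expected data flow (values are made up;
# the exact tags depend on the TreeTagger parameter files and on getTag()):
#
#   self.treetagger.TagText("Dogs bark")
#   => ['Dogs\tNNS\tdog', 'bark\tVVP\tbark']
#
#   after _process_sentence_tt("Dogs bark", 0) with lang "en":
#   self.sentences[0] == ['dog_NN_en', 'bark_VV_en']
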
def main():
    global input_is_tokenized, use_lemmatization, space_cols_file, \
        loaded_space_file_s, loaded_space_file_t, source_lang, \
        target_lang, input_file, output_file, tag_cutoff, \
        no_stopword_print, number_of_translations, \
        number_of_neighbours, different_pos_punishment, \
        treetagger_path

    parser = argparse.ArgumentParser(
        description="Word translations that fit best to the sentence")
    parser.add_argument("-k", "--tokenized", action="store_true",
                        help="use pretokenized input")
    parser.add_argument("-l", "--lemmatized", action="store_true",
                        help="use lemmatization")
    parser.add_argument("-p", "--returntag", action="store_true",
                        help="return language tag")
    parser.add_argument("-d", "--dimensions", type=str,
                        help="column file for the input matrix")
    parser.add_argument("-m", "--sourcematrix", type=str,
                        help="pickled input matrix for source language")
    parser.add_argument("-y", "--targetmatrix", type=str,
                        help="pickled input matrix for target language")
    parser.add_argument("-s", "--sourcelang", type=str,
                        help="input language")
    parser.add_argument("-t", "--targetlang", type=str,
                        help="output language")
    parser.add_argument("-i", "--infile", type=str, help="input file")
    parser.add_argument("-o", "--outfile", type=str, help="output file")
    parser.add_argument("-nsp", "--no-stopword-print", action="store_true",
                        help="Omit printing words without candidates -- "
                             "usually stop words.")
    parser.add_argument("-nt", "--number-of-translations", type=int,
                        help="The number of candidates to show for each "
                             "input word.")
    parser.add_argument("-nn", "--number-of-neighbours", type=int,
                        help="The number of neighbours for each input word "
                             "to consider in the similarity space "
                             "constructed.")
    parser.add_argument("-dpp", "--different-pos-punishment", type=float,
                        help="The score's fraction used to punish a "
                             "candidate word that is present but does not "
                             "share the POS of its input peer.")
    args = parser.parse_args()

    if args.sourcelang:
        source_lang = args.sourcelang
    if args.targetlang:
        target_lang = args.targetlang
    if args.tokenized:
        input_is_tokenized = True
    if args.lemmatized:
        use_lemmatization = True

    # derive default file names from the language pair if not given
    if args.dimensions:
        space_cols_file = args.dimensions
    elif source_lang == target_lang:
        space_cols_file = DATA_DIR_OUT + source_lang + '-words.col'
    else:
        space_cols_file = DATA_DIR_OUT \
            + '_'.join(sorted([source_lang, target_lang])) \
            + '-words.col'

    if args.sourcematrix:
        loaded_space_file_s = args.sourcematrix
    elif source_lang == target_lang:
        loaded_space_file_s = DATA_DIR_OUT + source_lang + '.pkl'
    else:
        loaded_space_file_s = DATA_DIR_OUT + source_lang \
            + '_' + source_lang + '-' + target_lang + '.pkl'

    if args.targetmatrix:
        loaded_space_file_t = args.targetmatrix
    elif source_lang == target_lang and loaded_space_file_t == "":
        loaded_space_file_t = DATA_DIR_OUT + target_lang + '.pkl'
    else:
        loaded_space_file_t = DATA_DIR_OUT + target_lang \
            + '_' + target_lang + '-' + source_lang + '.pkl'

    if args.infile:
        input_file = open(args.infile, "r")
    if args.outfile:
        output_file = open(args.outfile, "w")

    # number of trailing characters (the appended tag) to cut off when
    # printing; 0 keeps the tag
    if args.returntag:
        tag_cutoff = 0
    elif args.lemmatized:
        tag_cutoff = 5
    else:
        tag_cutoff = 3

    if args.no_stopword_print:
        no_stopword_print = args.no_stopword_print
    # propagate the remaining tuning options to the module-level defaults
    if args.number_of_translations:
        number_of_translations = args.number_of_translations
    if args.number_of_neighbours:
        number_of_neighbours = args.number_of_neighbours
    if args.different_pos_punishment:
        different_pos_punishment = args.different_pos_punishment
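    # Example invocation (a sketch; the file names are hypothetical and
    # the script name depends on how this module is installed):
    #
    #   python best_translations.py -s en -t de -i sentences.txt \
    #       -o candidates.txt -nt 5 -nn 20
    #
    # Without -d/-m/-y, the column and matrix files are derived from the
    # language pair as above.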
    # vector dimensions/columns for the input matrix and the per-sentence
    # matrix
    space_cols_fileobject = open(space_cols_file, "r")
    space_cols = space_cols_fileobject.read().split("\n")[:-1]
    space_cols_fileobject.close()

    # load the spaces; for similarity queries in the same language the
    # source space is loaded only once and reused
    loaded_space = {}
    loaded_space[source_lang] = io_utils.load(loaded_space_file_s)
    if not loaded_space.get(target_lang):
        loaded_space[target_lang] = io_utils.load(loaded_space_file_t)

    # Initialize TreeTagger only once (for later use)
    treetagger = TreeTagger(TAGLANG=source_lang, TAGDIR=treetagger_path,
                            TAGINENC=ENC, TAGOUTENC=ENC)

    # work on the input file
    while True:
        line = input_file.readline()
        words = []      # words in sentence
        lemmas = []     # lemmas in sentence
        pos = []        # part-of-speech tags per word in sentence
        formatted = []  # matrix rows for the sentence
        freq = defaultdict(lambda: defaultdict(int))

        # Stop when the file is entirely read
        if not line:
            break

        # For pre-treetagged text: one "word\tPOS\tlemma" line per token,
        # sentences ending in punctuation
        if input_is_tokenized:
            while not re.match(r'[.:?!]', line):
                t = line.rstrip()
                w = t.split("\t")[0]
                p = helpers.getTag(t.split("\t")[1], source_lang)
                l = t.split("\t")[2]
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(
                    w, p, l, source_lang, use_lemmatization))
                line = input_file.readline()
                if not line:
                    break
        # Use TreeTagger as lemmatizer and/or tokenizer
        else:
            treetagger_sentence = treetagger.TagText(line)
            for t in treetagger_sentence:
                try:
                    w = t.split("\t")[0]
                    p = helpers.getTag(t.split("\t")[1], source_lang)
                    l = t.split("\t")[2]
                except IndexError:
                    print >> sys.stderr, \
                        "Caution: TreeTagger token cannot " \
                        "be processed:", t
                    continue  # Skip it
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(
                    w, p, l, source_lang, use_lemmatization))

        # fill the co-occurrence matrix for the sentence
        for i in formatted:
            for j in formatted:
                freq[i][j] += 1

        # build a unique list of the words in this sentence for the rows
        query_rows = list(set(formatted))  # rows for the sentence matrix

        # DISSECT-compatible matrix
        m = np.mat(np.zeros(shape=(len(query_rows), len(space_cols))))

        # convert the sentence matrix to a compatible matrix
        for i in range(len(query_rows)):
            for j in range(len(space_cols)):
                m[i, j] = freq[query_rows[i]][space_cols[j]]

        # build the DISSECT space for the sentence
        query_space = Space(DenseMatrix(m), query_rows, space_cols)

        # for every word print its neighbours with similarity scores
        for i in range(len(words)):
            best_translations = get_best_translations(
                words[i], pos[i], lemmas[i], query_space, loaded_space)
            output_file.write(format_best_translations(
                words[i], pos[i], lemmas[i], best_translations))

        if input_is_tokenized:
            output_file.write(line.split("\t")[0] + "\n")

    if args.infile:
        input_file.close()
    if args.outfile:
        output_file.close()
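
# For orientation: get_best_translations() and format_best_translations()
# are defined elsewhere in this module. A neighbour lookup against a
# DISSECT space presumably runs along these lines (word and neighbour
# count are hypothetical; the query word's vector is taken from the
# sentence space, the neighbours from the loaded space):
#
#   from composes.similarity.cos import CosSimilarity
#   neighbours = query_space.get_neighbours(
#       "dog_NN_en", 20, CosSimilarity(),
#       space2=loaded_space[target_lang])
#   # => [('hund_NN_de', 0.83), ...]  (word, similarity) pairs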