def get_udpipe_model(lang):
    """Return the cached UDPipe model for ``lang``, loading it on first use.

    Model files are looked up in the directory reported by
    ``Resources.get_resources_dir()``. A loaded model is stored on
    ``Resources`` as ``<lang>_model`` so each file is loaded at most once.

    Args:
        lang: Language code; one of "en", "de", "fr" or "nl".

    Returns:
        The loaded UDPipe model for the language.

    Raises:
        Exception: If the language is not supported, or the model file
            could not be loaded.
    """
    # Human-readable name (used in error messages) and file per language.
    known_models = {
        "en": ("English", "english-ud-2.1-20180111.udpipe"),
        "de": ("German", "german-ud-2.0-170801.udpipe"),
        "fr": ("French", "french-sequoia-ud-2.1-20180111.udpipe"),
        "nl": ("Dutch", "dutch-ud-2.1-20180111.udpipe"),
    }
    # Reject unknown languages before touching UDPipe or the resources.
    if lang not in known_models:
        raise Exception(f"Language '{lang}' is not supported.")

    from ufal.udpipe import Model

    res_path = Resources.get_resources_dir()
    language_name, file_name = known_models[lang]

    # Every language gets its own cache slot. The German branch used to share
    # ``Resources.en_model`` with English, so whichever of the two was loaded
    # first was handed out for both languages.
    cache_attr = f"{lang}_model"
    model = getattr(Resources, cache_attr, None)
    if model is None:
        # Model.load reports failure by returning None instead of raising.
        model = Model.load(os.path.join(res_path, file_name))
        if model is None:
            raise Exception(f"Failed to load the {language_name} UDPipe model.")
        setattr(Resources, cache_attr, model)
    return model
def __init__(self, task='parse',
             model=os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'hungarian-szeged-ud-2.5-191206.udpipe'),
             source_fields=None, target_fields=None):
    """Load the UDPipe model and run the setup routine of the chosen task.

    Args:
        task: One of 'tok', 'pos', 'parse', 'tok-pos', 'tok-parse' or
            'pos-parse'.
        model: Path to the UDPipe model file; defaults to the Hungarian
            Szeged model located next to this source file.
        source_fields: Field names stored on ``self.source_fields``
            (an empty set when omitted).
        target_fields: Field names stored on ``self.target_fields``
            (an empty list when omitted).

    Raises:
        UDPipeError: If the model file cannot be loaded.
        ValueError: If ``task`` is not one of the available tasks.
    """
    # Model downloads:
    # - UDv1.2 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1659
    # - UDv2.4 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2898
    # - UDv2.5 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
    loaded_model = Model.load(model)
    self._model = loaded_model
    if loaded_model is None:
        raise UDPipeError('ERROR:Loading modelfile {0}'.format(model))

    setup_by_task = {'tok': self._setup_tok,
                     'pos': self._setup_pos,
                     'parse': self._setup_parse,
                     'tok-pos': self._setup_tok_pos,
                     'tok-parse': self._setup_tok_parse,
                     'pos-parse': self._setup_pos_parse}
    # Pick the first setup routine whose keyword equals the requested task.
    setup = next((routine for keyword, routine in setup_by_task.items()
                  if task == keyword), None)
    if setup is None:
        raise ValueError('No proper task is specified. The available tasks are {0}'.
                         format(' or '.join(setup_by_task.keys())))
    setup()
    self._task = task  # Remembered for later use

    # Field names for xtsv (mandatory for an xtsv module)
    self.source_fields = set() if source_fields is None else source_fields
    self.target_fields = [] if target_fields is None else target_fields
def tokenize_and_tag_texts(dict_texts):
    """Run the English and French UDPipe pipelines over the given texts.

    Args:
        dict_texts: Mapping whose values are indexable; index 1 is fed to
            the English pipeline and index 2 to the French pipeline.

    Returns:
        A dict ``{'eng': ..., 'fr': ...}`` with the pipeline output for the
        LAST entry of ``dict_texts``, or an empty dict when ``dict_texts``
        is empty (this used to fail with ``UnboundLocalError``).

    Raises:
        IOError: If either model file cannot be loaded.
    """
    model_files = {
        'eng': 'english-partut-ud-2.5-191206.udpipe',
        'fr': 'french-partut-ud-2.5-191206.udpipe',
    }
    pipelines = {}
    for code, model_file in model_files.items():
        model = Model.load(model_file)
        # Model.load reports failure by returning None instead of raising.
        if model is None:
            raise IOError("Cannot load UDPipe model from file '%s'" % model_file)
        pipelines[code] = Pipeline(model, 'generic_tokenizer', '', '', '')

    dict_tokenized_tagged_texts = {}
    # NOTE(review): every entry is processed but each result overwrites the
    # previous one, so only the last entry is returned. This matches the
    # original behaviour; confirm whether per-key results were intended.
    for primal_texts in dict_texts.values():
        dict_tokenized_tagged_texts = {
            'eng': pipelines['eng'].process(primal_texts[1]),
            'fr': pipelines['fr'].process(primal_texts[2]),
        }
    return dict_tokenized_tagged_texts
def run(model_file, text_file):
    """Parse a text file with UDPipe and collect phrases from each sentence.

    Args:
        model_file: Path to the UDPipe model.
        text_file: Path to the plain-text corpus.

    Returns:
        List of the values returned by ``get_phrase`` for every entry that
        ``get_preps`` reports, in sentence order.
    """
    print('Loading model...')
    udpipe_model = Model.load(model_file)
    pipeline = Pipeline(udpipe_model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')

    print('Reading corpus...')
    with open(text_file) as corpus:
        raw_text = corpus.read()

    print('Analyzing text...')
    conllu = pipeline.process(raw_text)

    print('Extracting phrases...')
    phrases = []
    tokens = []
    # The trailing '#' acts as a sentinel that flushes the final sentence.
    for row in tqdm((conllu + '#').splitlines()):
        if row.startswith('#') and tokens:
            # A comment line after collected tokens closes the sentence.
            for prep, dep_id in get_preps(tokens).values():
                phrases.append(get_phrase(prep, dep_id, tokens))
            tokens.clear()
            continue
        if len(row) <= 1:
            continue
        try:
            tokens.append(Token(row.split('\t')))
        except ValueError:
            # Rows that Token rejects are skipped.
            pass

    print('Done!')
    return phrases
def _tokenize(self, text='Текст нужно передать функции в виде строки!'):
    """Lemmatize ``text`` sentence by sentence with the UDPipe pipeline.

    The model is loaded from ``BASE_DIR/model/udpipe_syntagrus.model`` the
    first time it is needed and kept on ``self.udpipe_model``.

    Args:
        text: The text to process, as a single string.

    Returns:
        The concatenated output of ``lemmatize`` over all sentences.

    Raises:
        IOError: If the model file does not exist.
    """
    from utils import lemmatize

    if not self.udpipe_model:
        model_file = os.path.join(BASE_DIR, 'model', 'udpipe_syntagrus.model')
        if not os.path.isfile(model_file):
            msg = 'UDPipe model not found!'
            logging.critical(msg)
            raise IOError(msg)
        self.udpipe_model = Model.load(model_file)

    started = time()
    pipeline = Pipeline(self.udpipe_model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')

    lemmas = []
    for sentence in nltk.sent_tokenize(text):
        # A text-cleaning step (e.g. unify_sym(sentence.strip())) could go here.
        lemmas.extend(lemmatize(pipeline, text=sentence))

    # NOTE(review): the counter is assumed to advance once per call, not once
    # per sentence; the original layout was ambiguous, so confirm.
    self.tagged_counter += 1
    log(f'{self.tagged_counter} of {self.tagged_max} created, for {round(time() - started, 2)}s')
    return lemmas
def __init__(self, lang):
    """Load the UDPipe model for the given language.

    lang (unicode): ISO 639-1 language code or shorthand UDPipe model name.
    RETURNS (UDPipeModel): Language specific UDPipeModel.

    Raises an Exception when the model file cannot be loaded.
    """
    model_path = get_path(lang)
    self.model = Model.load(model_path)
    if not self.model:
        raise Exception(
            "Cannot load UDPipe model from file '{}'".format(model_path))

    # Only the part before the first '-' is kept as the language code.
    self._lang = lang.split('-')[0]
    self._meta = {
        'authors': "Milan Straka, Jana Straková",
        'description': "UDPipe pretrained model.",
        'email': '*****@*****.**',
        'lang': 'udpipe_{}'.format(self._lang),
        'license': 'CC BY-NC-SA 4.0',
        'name': model_path.rsplit('/', 1)[-1],
        'parent_package': 'spacy_udpipe',
        'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser',
        'source': 'Universal Dependencies 2.4',
        'url': 'http://ufal.mff.cuni.cz/udpipe',
        'version': '1.2.0',
    }
def udpipeG(pathmodel):
    """Parse every cleaned text file and write one combined CoNLL-U corpus.

    Each file in the hard-coded ``cleanedTxt`` directory is read as UTF-8,
    processed with a "horizontal"-input UDPipe pipeline, and appended to
    ``corpus.conllu``, which is emptied first.

    Args:
        pathmodel: Path to the UDPipe model file.

    Exits the process with status 1 if the model cannot be loaded or if
    UDPipe reports a processing error.
    """
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    # List the directory once instead of on every iteration.
    filenames = os.listdir(path)
    total = len(filenames)

    # "w" empties the file just like the former open("a") + truncate(0).
    # The context managers guarantee both files are closed (they used to
    # leak), including when sys.exit() is raised below. The output is written
    # as UTF-8 to match the encoding used for reading the inputs.
    with io.open("/home/guido/Progetto Unitexto/textdata/corpus.conllu",
                 "w", encoding="utf-8") as corpus:
        for i, filename in enumerate(filenames, 1):
            with io.open(path + filename, "r", encoding="utf-8") as text:
                string = text.read()

            processed = pipeline.process(string, error)
            if error.occurred():
                sys.stderr.write("An error occurred when running run_udpipe: ")
                sys.stderr.write(error.message)
                sys.stderr.write("\n")
                sys.exit(1)

            corpus.write(processed)
            print("File n ", i, " processed of ", total)
def parse(text, sentence_id):
    """Produce the CoNLL-U annotation of one raw-text sentence with UDPipe.

    Parameters:
        text - the sentence to be parsed
        sentence_id - the ID of the sentence; it is converted with ``str``,
            so integer IDs are accepted as well

    Output:
        the CoNLL-U text of the sentence, with ``# sent_id = 1`` replaced
        by the given sentence ID

    Raises:
        IOError - if the UDPipe model file cannot be loaded
    """
    model_path = './models/udpipe/english-ewt-ud-2.3-181115.udpipe'
    model = Model.load(model_path)
    # Model.load reports failure by returning None instead of raising.
    if model is None:
        raise IOError("Cannot load UDPipe model from file '%s'" % model_path)

    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    conllu_output = OutputFormat.newOutputFormat("conllu")
    sentence = Sentence()
    error = ProcessingError()

    tokenizer.setText(text)
    # Only the first sentence produced by the tokenizer is used.
    tokenizer.nextSentence(sentence, error)
    model.tag(sentence, model.DEFAULT)
    model.parse(sentence, model.DEFAULT)

    return conllu_output.writeSentence(sentence).replace(
        '# sent_id = 1', '# sent_id = ' + str(sentence_id))
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    :raises ValueError: if the model cannot be loaded
    :raises RuntimeError: if UDPipe reports a processing error
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError

    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")

    # Flatten once; the lines are needed both for the text and for the count.
    all_lines = [line for sentence in sentences for line in sentence]
    text = "\n".join(all_lines)
    error = ProcessingError()
    # Non-empty lines are what the progress message calls "tokens".
    num_tokens = sum(1 for line in all_lines if line)

    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens), end="", flush=True)
    started = time()
    processed = pipeline.process(text, error)
    duration = time() - started

    with ioutil.external_write_mode():
        rate = num_tokens / duration if duration else 0
        print("Done (%.3fs, %.0f tokens/s)" % (duration, rate))
        if verbose:
            print(processed)

    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
def parse_file(text):
    """Process ``text`` with the Russian UDPipe model via ``process_udpipe``.

    The pipeline tokenizes with the 'tokenizer=ranges' option and emits
    CoNLL-U.
    """
    from ufal.udpipe import Model, Pipeline

    russian_model = Model.load(MODELS_DIR + MODEL_NAMES['russian'])
    russian_pipeline = Pipeline(russian_model, 'tokenizer=ranges',
                                Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    return process_udpipe(text, russian_pipeline)
def udpipeS(pathmodel, sourcepath, pathdestination):
    """Parse every file in ``sourcepath`` into its own CoNLL-U file.

    For each input file the output name is built by dropping the last three
    characters of the file name and appending "conllu" (so ``a.txt`` becomes
    ``a.conllu``), placed under ``pathdestination``.

    Args:
        pathmodel: Path to the UDPipe model file.
        sourcepath: Directory prefix (including trailing separator) that
            holds the UTF-8 input files.
        pathdestination: Directory prefix (including trailing separator)
            for the output files.

    Exits the process with status 1 if the model cannot be loaded or if
    UDPipe reports a processing error.
    """
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    # List the directory once instead of on every iteration.
    filenames = os.listdir(sourcepath)
    total = len(filenames)

    for i, filename in enumerate(filenames, 1):
        target = pathdestination + filename[:-3] + "conllu"
        # "w" empties the file just like the former open("a") + truncate(0).
        # Both handles are now closed on every path; previously the input
        # was never closed and the output leaked on the error path. The
        # output is written as UTF-8 to match the input encoding.
        with io.open(target, "w", encoding="utf-8") as out:
            with io.open(sourcepath + filename, "r", encoding="utf-8") as text:
                string = text.read()

            processed = pipeline.process(string, error)
            if error.occurred():
                sys.stderr.write("An error occurred when running run_udpipe: ")
                sys.stderr.write(error.message)
                sys.stderr.write("\n")
                sys.exit(1)

            out.write(processed)
        print("File n ", i, " processed of ", total)
def load_file(self, name, filename, lang):
    """Parse one corpus file with UDPipe and store its output and metadata.

    Lines of the form ``# key = value`` are collected as metadata; all other
    lines are joined and sent through the pipeline. The result is stored in
    ``self.files[name][filename]`` (with ``# newdoc`` / ``# newpar`` lines
    removed) and the metadata in ``self.metadata[name][filename]``.

    Args:
        name: Corpus name (sub-directory of ``corpora``).
        filename: File name inside the corpus directory.
        lang: Key into ``udpipe_models`` selecting the model to use.

    Returns:
        ``False`` if the file content cannot be read or decoded, otherwise
        ``None`` (also when ``filename`` is one of the special files, which
        are skipped).

    Raises:
        IOError: If the UDPipe model for ``lang`` cannot be loaded.
    """
    if filename in tronco_special_files:
        return None

    filename_dir = os.path.join(root_path, "corpora", name, filename)

    if lang not in self.models:
        model_path = os.path.join(root_path, "udpipe", udpipe_models[lang]['path'])
        model = Model.load(model_path)
        # Model.load returns None on failure; do not cache a failed load.
        if model is None:
            raise IOError("Cannot load UDPipe model from file '%s'" % model_path)
        self.models[lang] = model
    pipeline = Pipeline(self.models[lang], "tokenize", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")

    with open(filename_dir) as f:
        try:
            text = f.read().splitlines()
        except (OSError, UnicodeDecodeError):
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # programming errors are no longer swallowed.
            return False

    raw_text = []
    metadata = {'filename': filename}
    for line in text:
        if line.strip().startswith("# ") and " = " in line:
            head, value = line.split(" = ", 1)
            metadata[head.split("# ", 1)[1]] = value
        else:
            raw_text.append(line)

    if name not in self.files:
        self.files[name] = {}
    self.files[name][filename] = pipeline.process(
        "\n".join(raw_text)).replace("# newdoc\n", "").replace("# newpar\n", "")

    if name not in self.metadata:
        self.metadata[name] = {}
    self.metadata[name][filename] = metadata
    return None
def __init__(self,
             lang: str,
             path: Optional[str] = None,
             meta: Optional[Dict] = None):
    """Load UDPipe model for given language.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    path: Path to UDPipe model; resolved with ``get_path`` when omitted.
    meta: Meta-information about the UDPipe model; a default description
        is built when omitted.

    Raises:
        Exception: If the model file cannot be loaded.
    """
    path = path or get_path(lang=lang)
    self.model = Model.load(path)
    # Model.load returns None on failure. Fail here with a clear message
    # instead of letting a later call crash on the missing model.
    if self.model is None:
        raise Exception(f"Cannot load UDPipe model from file '{path}'")

    # Only the part before the first '-' is kept as the language code.
    self._lang = lang.split("-")[0]
    self._meta = meta or {
        "author": "Milan Straka & Jana Straková",
        "description": "UDPipe pretrained model.",
        "email": "*****@*****.**",
        "lang": f"udpipe_{self._lang}",
        "license": "CC BY-NC-SA 4.0",
        "name": path.split("/")[-1],
        "parent_package": "spacy_udpipe",
        "pipeline": ["Tokenizer", "Tagger", "Lemmatizer", "Parser"],
        "source": "Universal Dependencies 2.5",
        "url": "http://ufal.mff.cuni.cz/udpipe",
        "version": "1.2.0",
    }
def tag_ud(text='Текст нужно передать функции в виде строки!',
           modelfile='udpipe_syntagrus.model'):
    """Return ``lemma_TAG`` strings for every token UDPipe finds in ``text``.

    Args:
        text: The text to process, as a single string.
        modelfile: Path to the UDPipe model file.

    Returns:
        List of strings built from the lower-cased third column and the
        fourth column of each CoNLL-U token row, joined by ``_``.
    """
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    # Process the text; the result is in CoNLL-U format.
    conllu = pipeline.process(text)

    tagged = []
    for row in conllu.split('\n'):
        # Skip service (comment) lines and empty lines.
        if row.startswith('#') or not row:
            continue
        columns = row.split('\t')
        # Extract the lemma and the tag from the processed row.
        tagged.append(columns[2].lower() + '_' + columns[3])
    return tagged
def load_nli_data(path, snli=False, udpipe_path=None, seq_length=50, r=10, cache_file=''):
    """
    Load MultiNLI or SNLI data.

    If the 'snli' parameter is set to True, a genre label of snli will be
    assigned to the data.

    Args:
        path: File with one example per line; each line is handed to
            ``process_line`` in a pool of 32 worker processes.
        snli: Stored in the module-level ``is_snli``.
        udpipe_path: Optional UDPipe model path; when given, a module-level
            ``pipeline`` and ``error`` are created before the pool starts.
        seq_length: Stored in the module-level ``pr_seq_length``.
        r: Stored in the module-level ``pr_r``.
        cache_file: Pickle file used as a cache. If it exists it is loaded
            and returned instead of processing ``path``; otherwise the
            processed data is written to it. An empty string (the default)
            disables caching.

    Returns:
        List of processed examples with ``None`` entries removed. Freshly
        processed data is shuffled with ``random.seed(1)``.

    Raises:
        IOError: If ``udpipe_path`` is given but the model cannot be loaded.
    """
    global is_snli, pipeline, error, pr_seq_length, pr_r
    is_snli = snli
    pr_r = r
    pr_seq_length = seq_length
    pipeline = None
    print(path)

    if cache_file and os.path.exists(cache_file):
        # NOTE: pickle.load can execute arbitrary code; only point
        # cache_file at files this program wrote itself.
        with open(cache_file, 'rb') as f:
            return [w for w in pickle.load(f) if w is not None]

    if udpipe_path:
        model = Model.load(udpipe_path)
        # Model.load reports failure by returning None instead of raising.
        if model is None:
            raise IOError("Cannot load UDPipe model from file '%s'" % udpipe_path)
        pipeline = Pipeline(model, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        error = ProcessingError()

    with open(path) as f:
        lines = list(f)

    # The pool is created after the globals above are set; presumably
    # process_line reads them in the workers (TODO confirm). The context
    # manager tears the pool down even if processing raises.
    with Pool(32) as pool:
        pending = pool.map_async(process_line, lines, chunksize=1)
        while not pending.ready():
            # _number_left is a private attribute of the async result; it is
            # used only for this progress message.
            print('{} lines left'.format(pending._number_left))
            time.sleep(10)
        data = [w for w in pending.get() if w is not None]

    random.seed(1)
    random.shuffle(data)

    # With the default cache_file='' the old code crashed in open('', 'wb')
    # after all the work was done; now caching is simply skipped.
    if cache_file:
        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)
    return data
def process_task_stream(sentence):
    """Load the Russian UDPipe model and pass ``sentence`` to ``process_data``.

    The pipeline is created with empty strings for all four of its
    configuration arguments.
    """
    from ufal.udpipe import Model, Pipeline

    # The language is hard-coded so far.
    russian_model = Model.load(MODELS_DIR + MODEL_NAMES['russian'])
    bare_pipeline = Pipeline(russian_model, '', '', '', '')
    print('...loaded the model')
    return process_data(sentence, bare_pipeline)
def __init__(self, model_filename: str) -> None:
    """Load the UDPipe model and the predicate list.

    Args:
        model_filename: Path to the UDPipe model file.

    Raises:
        Exception: If the model cannot be loaded.
    """
    self.model = Model.load(model_filename)
    self.logger = logging.getLogger(type(self).__name__)
    if not self.model:
        raise Exception('Cannot load model from file "{}".'.format(model_filename))

    # One predicate per line, stored stripped and lower-cased.
    with open("resources/predicates.txt") as predicates_file:
        self.predicates = set(line.strip().lower() for line in predicates_file)
def make_conll_with_udpipe(text):
    """Return the CoNLL-U output of the Russian SynTagRus model for ``text``.

    The model is loaded from ``udparsers`` under the current working
    directory.
    """
    # Set the path to the model here.
    model_dir = os.path.join(os.getcwd(), 'udparsers')
    model = Model.load(
        os.path.join(model_dir, 'russian-syntagrus-ud-2.5-191206.udpipe'))
    conllu_pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                               Pipeline.DEFAULT, 'conllu')
    return conllu_pipeline.process(text)
def make_conll_with_udpipe(text, language='german'):
    """Return the CoNLL-U output of UDPipe for ``text``.

    Args:
        text: Raw text to tokenize, tag and parse.
        language: Language of the text; only 'german' is available.

    Returns:
        The pipeline output in CoNLL-U format.

    Raises:
        ValueError: If ``language`` is not supported. Previously any other
            value failed with ``UnboundLocalError`` on ``model_path``.
        IOError: If the model file cannot be loaded.
    """
    if language != 'german':
        raise ValueError(
            "Unsupported language '{}'; only 'german' is available.".format(language))

    model_path = path.join('..', '..', 'udpipe', 'german-ud-2.0-170801.udpipe')
    model = Model.load(model_path)
    # Model.load reports failure by returning None instead of raising.
    if model is None:
        raise IOError("Cannot load UDPipe model from file '{}'".format(model_path))

    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
def __init__(self, sourceForInf, sourceForDict, sourceForDL):
    """Load the UDPipe model, the network weights and the word dictionary.

    Args:
        sourceForInf: Path to the UDPipe model file.
        sourceForDict: Path to a JSON file loaded into ``clasterOfWord``.
        sourceForDL: Path to the saved state dict of ``Net``; it is loaded
            onto the CPU.
    """
    self.nomberOfKnownWords = 0
    self.modelForInfinitive = Model.load(sourceForInf)
    self.vectorOfStr = torch.zeros(2000)
    self.sourceForDL = sourceForDL

    self.net = Net(2000, 1000, 42)
    weights = torch.load(sourceForDL, map_location='cpu')
    self.net.load_state_dict(weights)

    self.needToSearch = False
    self.boolClassify = False

    self.clasterOfWord = {}
    with open(sourceForDict) as json_file:
        self.clasterOfWord = json.load(json_file)

    # Theme names in id order: the position in this tuple is the theme id.
    theme_names = (
        'авто/мото',
        'активный отдых',
        'бизнес',
        'домашние животные',
        'здоровье',
        'знакомство и общение',
        'игры',
        'ИТ (компьютеры и софт)',
        'кино',
        'красота и мода',
        'кулинария',
        'культура и искусство',
        'литература',
        'мобильная связь и интернет',
        'музыка',
        'наука и техника',
        'недвижимость',
        'новости и СМИ',
        'безопасность',
        'образование',
        'обустройство и ремонт',
        'политика',
        'продукты питания',
        'промышленность',
        'путешествия',
        'работа',
        'развлечения',
        'религия',
        'дом и семья',
        'спорт',
        'страхование',
        'телевидение',
        'товары и услуги',
        'увлечения и хобби',
        'финансы',
        'фото',
        'эзотерика',
        'электроника и бытовая техника',
        'эротика',
        'юмор',
        'общество, гуманитарные науки',
        'дизайн и графика',
    )
    self.idOfThemes = {theme: index for index, theme in enumerate(theme_names)}
def load_data(filename: str,
              lemmatize: bool,
              stopwords_path: str,
              manual_language=None) -> Tuple[List, List]:
    """Read citations from a file and preprocess them with UDPipe.

    The citation ID is built by joining the first four whitespace-separated
    tokens of a line with ``_`` and must contain a ``:``. Citations for
    which ``select_lang_pipeline`` returns no pipeline are reported and
    skipped.

    Args:
        filename: Text file with one citation per line.
        lemmatize: Passed through to ``preprocess``.
        stopwords_path: Directory whose files each hold one stopword per
            line.
        manual_language: Passed through to ``select_lang_pipeline``.

    Returns:
        ``(citation_ids, citation_texts)``, two parallel lists.

    Raises:
        IOError: If one of the UDPipe models cannot be loaded.
        AssertionError: If a citation ID does not contain ``:``.
    """
    with open(filename, 'r') as f:
        citations = f.read().split('\n')

    stopwords = []
    for stopwords_list in listdir(stopwords_path):
        with open(path.join(stopwords_path, stopwords_list), 'r') as f:
            stopwords.extend(f.read().split('\n'))

    # Download models here: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2998#
    pipelines = []
    for model_file in ('english-ewt-ud-2.4-190531.udpipe',
                       'russian-syntagrus-ud-2.4-190531.udpipe'):
        model_path = path.join('.', 'udpipe', model_file)
        model = Model.load(model_path)
        # Model.load reports failure by returning None instead of raising.
        if model is None:
            raise IOError("Cannot load UDPipe model from file '{}'".format(model_path))
        pipelines.append(Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                                  Pipeline.DEFAULT, 'conllu'))
    en_pipeline, ru_pipeline = pipelines

    pattern = re.compile('[^a-zа-яA-ZА-Я ]+')
    # Raw string: same pattern as before, without invalid-escape warnings.
    pattern_brackets = re.compile(r'[\(\[].*?[\)\]]')

    citations_ids = []
    citations_texts = []
    citation_id_position = 4

    for citation in citations:
        id_tokens = citation.split()[:citation_id_position]
        if not id_tokens:
            # Blank lines (e.g. the empty string after a trailing newline)
            # used to produce an empty ID and trip the assertion below. The
            # old ``except IndexError`` could never fire because slicing
            # does not raise.
            continue
        citation_id = '_'.join(id_tokens)
        if ':' not in citation_id:
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError('Citation ID parsed incorrectly')

        normalized_citation = normalize_sentence(citation, pattern,
                                                 pattern_brackets)
        pipeline = select_lang_pipeline(normalized_citation, en_pipeline,
                                        ru_pipeline, manual_language)
        if not pipeline:
            print('Language has not been detected for {}'.format(citation_id))
            continue

        citations_ids.append(citation_id)
        citations_texts.append(
            preprocess(pipeline, normalized_citation, stopwords,
                       lemmatize=lemmatize))

    return citations_ids, citations_texts
def __init__(self):
    """Load the SynTagRus UDPipe model from its fixed path and build the pipeline.

    Exits the process with status 1 if the model cannot be loaded.
    """
    print('Loading model: ')
    self.model = Model.load(r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model")
    if not self.model:
        # The message reads "The model is not loaded :(".
        print('Модель не загружена :(')
        sys.exit(1)

    self.pipeline = Pipeline(
        self.model,
        "tokenize",
        Pipeline.DEFAULT,
        Pipeline.DEFAULT,
        "conllu",
    )
    print('done\n')
def __init__(self, seed=42):
    """Create the morphological analyzer, the UDPipe pipeline and lookup data.

    Args:
        seed: Stored on ``self.seed`` before ``init_seed`` is called.
    """
    self.morph = pymorphy2.MorphAnalyzer()

    # The UDPipe arguments are passed as bytes, as in the original code.
    self.model = Model.load(b"data/udpipe_syntagrus.model")
    self.process_pipeline = Pipeline(
        self.model,
        b'tokenize',
        Pipeline.DEFAULT,
        Pipeline.DEFAULT,
        b'conllu',
    )

    self.seed = seed
    self.init_seed()

    self.paronyms = self.get_paronyms()
    self.freq_bigrams = self.open_freq_grams()
def load(self, model_path: str):
    """Load the UDPipe model and create its default tokenizer.

    Args:
        model_path: Path to the UDPipe model file.

    Raises:
        Exception: If the model cannot be loaded or has no tokenizer.
    """
    logger.info("Loading UdPipe model ...")

    model = Model.load(model_path)
    self.model = model
    if not model:
        raise Exception("Cannot load model from file '%s'." % model_path)

    tokenizer = model.newTokenizer(model.DEFAULT)
    self.tokenizer = tokenizer
    if not tokenizer:
        raise Exception("The model does not have a tokenizer")

    self.error = ProcessingError()
def load_model(self):
    """Load and return the UDPipe model registered for ``self.language``.

    Raises:
        ParserException: If ``Parser.MODELS`` has no (non-empty) entry for
            the language, or the model file cannot be loaded.
    """
    model_path = Parser.MODELS.get(self.language)
    if not model_path:
        raise ParserException(
            "Cannot find model for language '%s'" % self.language)

    loaded = Model.load(model_path)
    if loaded:
        return loaded
    raise ParserException("Cannot load model from file '%s'\n" % model_path)
def get(modelAdd, text):
    """Return the CoNLL-U output of UDPipe for ``text``.

    Args:
        modelAdd: Path to the UDPipe model file.
        text: Raw text to tokenize, tag and parse.

    The ``ProcessingError`` object passed to the pipeline is not inspected;
    the pipeline output is returned as is.
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError

    error = ProcessingError()
    pipeline = Pipeline(Model.load(modelAdd), 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    return pipeline.process(text, error)
def __init__(self, model):
    """Create the UDPipe tool object.

    Args:
        model: Model identifier; ``require_file`` turns it into the path
            that is loaded.

    Raises:
        IOError: If the model cannot be loaded from the resolved path.
    """
    self.model = model
    model_file = require_file(model)

    tool = Model.load(model_file)
    self.tool = tool
    if not tool:
        raise IOError("Cannot load model from file '%s'" % model_file)

    self.error = ProcessingError()
    self.conllu_reader = ConlluReader()
    self.tokenizer = tool.newTokenizer(Model.DEFAULT)
def extract_sentences(input_file: str, output_file: str, logger) -> None:
    """Extract the first sentence of every paragraph with UDPipe.

    Each input line is expected to be ``<uri>\\t<paragraph>``. For every
    non-blank line one output line ``<uri>\\t<first sentence>`` is written;
    the sentence is empty when the line has no paragraph, when UDPipe
    reports an error, or when its output has fewer than four lines.

    Args:
        input_file: Path of the paragraphs file to read.
        output_file: Path of the sentences file to write (overwritten).
        logger: Logger used for progress and error messages.

    Raises:
        IOError: If the UDPipe language model cannot be loaded.
    """
    logger.info(
        "==== Now performing sentence extraction from paragraphs file ====")

    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        # Previously only logged, after which the code carried on with the
        # missing model; stop here instead.
        message = 'Could not load UDPipe language model: ' + lang_model
        logger.error(message)
        raise IOError(message)
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, '')
    ud_error = ProcessingError()

    sentences_count = 0
    # Context managers close both files even if processing raises.
    with open(output_file, "w") as sentences_file, \
            open(input_file, "r") as paragraphs_file:
        for p_line in paragraphs_file:
            if not p_line.strip():
                continue

            fields = p_line.split('\t', 1)
            # The URI is taken from the current line in every case. Before,
            # a line without a tab reused the previous line's URI (or failed
            # with UnboundLocalError on the first line).
            page_uri = fields[0].rstrip('\n')
            page_first_sentence = ""

            # If there is a paragraph content
            if len(fields) == 2:
                # Extract first sentence from paragraph using UDPipe:
                ud_output = ud_pipeline.process(fields[1], ud_error)
                if ud_error.occurred():
                    logger.error(
                        'Error occured while extracting sentence using UDPipe: '
                        + ud_error.message)
                else:
                    output_lines = ud_output.split('\n')
                    if len(output_lines) >= 4:
                        # Assumption about the output format: the fourth
                        # line carries the sentence text after a
                        # 9-character prefix (presumably "# text = ").
                        page_first_sentence = output_lines[3][9:]

            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')
            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))

    logger.info("Finished extraction of {} sentences.".format(sentences_count))
def get_pipeline(modelfile):
    """Return a UDPipe tokenize/tag/parse pipeline that emits CoNLL-U.

    If ``modelfile`` does not exist, the SynTagRus model is downloaded from
    rusvectores.org to that path and loaded from where it was saved.

    Args:
        modelfile: Path to the UDPipe model file.

    Returns:
        The configured ``Pipeline``.

    Raises:
        IOError: If the model file cannot be loaded.
    """
    print('\nLoading the model...', file=sys.stderr)
    if not os.path.isfile(modelfile):
        udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
        print('UDPipe model not found. Downloading...', file=sys.stderr)
        # Save to the requested location and load from the path wget reports.
        # Before, the file landed in the current directory under the URL's
        # base name while the load still used the missing ``modelfile``.
        modelfile = wget.download(udpipe_model_url, out=modelfile)

    ufal_model = Model.load(modelfile)
    # Model.load reports failure by returning None instead of raising.
    if ufal_model is None:
        raise IOError("Cannot load UDPipe model from file '%s'" % modelfile)

    return Pipeline(ufal_model, 'tokenize', Pipeline.DEFAULT,
                    Pipeline.DEFAULT, 'conllu')
def init(self):
    """Load the UDPipe model and create the helper objects on first use.

    Nothing happens when ``self.model`` is already set.

    NOTE(review): the source layout was collapsed, so it is assumed that the
    tagger/parser/error/converter assignments belong inside the
    ``self.model is None`` guard; confirm against the original file.
    """
    if self.model is None:
        self.model = Model.load(self._model_path)
        if not self.model:
            # A failed load is only reported on stderr; initialisation
            # continues with the falsy model.
            sys.stderr.write('Cannot load model from file "%s"\n' % self._model_path)

        # Each stage is switched between Pipeline.DEFAULT and Pipeline.NONE
        # according to the corresponding enable flag.
        self.tagger = Pipeline.DEFAULT if self._enable_tagger else Pipeline.NONE
        self.parser = Pipeline.DEFAULT if self._enable_parser else Pipeline.NONE
        self.error = ProcessingError()
        self.converter_conll = ConverterConllUDV1()
def main():
    """Tokenize a set of files with UDPipe and write vocabulary files.

    The files matching ``--from_dir_prefix`` are handed to
    ``multipd.timeouted_run_pool``; the word counters it returns are summed
    and written, most frequent first, as a word list (``--to_vocab_file``)
    and as ``word<TAB>frequency`` lines (``<to_vocab_file>.freqs``).
    """
    parser = argparse.ArgumentParser()
    # The glob pattern has no usable default, so it is required; omitting it
    # used to crash later inside glob().
    parser.add_argument('-d', '--from_dir_prefix', type=str, required=True)
    parser.add_argument('-t', '--to_dir_prefix', default='/tmp/workers', type=str)
    parser.add_argument('-v', '--to_vocab_file', default='/tmp/workers/vocab.txt', type=str)
    parser.add_argument('-u', '--path2udp_model',
                        default='./russian-syntagrus-ud-2.0-170801.udpipe', type=str)
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    # timeout_duration = 40 min
    parser.add_argument('-T', '--timeout_duration', default=40 * 60, type=int)
    args = parser.parse_args()

    os.makedirs(args.to_dir_prefix, exist_ok=True)

    model = Model.load(args.path2udp_model)
    # Model.load reports failure by returning None instead of raising.
    if model is None:
        parser.error("Cannot load UDPipe model from file '%s'" % args.path2udp_model)
    udpipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'horizontal')

    files = list(glob(args.from_dir_prefix))
    word_counts = multipd.timeouted_run_pool(files, udpipeline, args.to_dir_prefix,
                                             cpu_n=args.cpu_n,
                                             timeout_duration=args.timeout_duration)
    word_count = sum(word_counts, collections.Counter())

    # Sort once; most_common() used to be recomputed three times.
    ranked = word_count.most_common()

    # One word per line with no trailing newline. The old code wrote every
    # word with a newline and then appended the last word again, duplicating
    # it, and raised IndexError when no words were counted.
    with open(args.to_vocab_file, 'wt') as vocab_file:
        vocab_file.write('\n'.join(word for word, _ in ranked))

    # NOTE(review): as in the original, the least frequent word is left out
    # of the frequency file; confirm whether that omission is intended.
    with open(args.to_vocab_file + '.freqs', 'wt') as freqs_file:
        freqs_file.writelines(
            '{}\t{}\n'.format(word, freq) for word, freq in ranked[:-1])