def fact_stop_word_filter(stop_words_file):
    """Build a stop-word filter that matches on the lower-cased token.

    The stop words are loaded once, via ``load_stop_words``, when this
    factory is called; the returned function closes over them.

    Args:
        stop_words_file: Passed straight through to ``load_stop_words``.

    Returns:
        A function taking an iterable of string tokens and returning a new
        list of the tokens whose lower-cased form is not in the loaded stop
        words. Order and original casing of the kept tokens are preserved.
    """
    stop_words = load_stop_words(stop_words_file)

    def stp_flter(tokens):
        kept = []
        for tok in tokens:
            # Only the lookup is lower-cased; the token itself is kept as-is.
            if tok.lower() in stop_words:
                continue
            kept.append(tok)
        return kept

    return stp_flter
def fact_case_sensitive_stop_word_filter(stop_words_file):
    """Build a stop-word filter that matches tokens exactly as given.

    The stop words are loaded once, via ``load_stop_words``, when this
    factory is called; the returned function closes over them.

    Args:
        stop_words_file: Passed straight through to ``load_stop_words``.

    Returns:
        A function taking an iterable of tokens and returning a new list of
        the tokens that are not in the loaded stop words. No case folding is
        applied, and the order of the kept tokens is preserved.
    """
    stop_words = load_stop_words(stop_words_file)

    def cs_stop_filter(tokens):
        kept = []
        for tok in tokens:
            # Membership is tested on the token unchanged (no lower-casing).
            if tok not in stop_words:
                kept.append(tok)
        return kept

    return cs_stop_filter
# NOTE(review): this line is a whitespace-collapsed excerpt. Its first two
# statements (`lbl, sim = ...` and `f.write(...)`) use names (`kword`, `f`,
# `kwd2cluster_sims`) that are not defined in the visible code, so they appear
# to belong to code that starts before this excerpt.
#
# The remainder is the top-level "Extract Clustered Synonyms" script. Steps
# visible here:
#   1. Require exactly one command-line argument (the config file name) and
#      pass it to GenerateClusterSynonymsConfig; otherwise raise Exception.
#   2. Load the Word2Vec model from config.model_file.
#   3. Collect into the set `keywords` everything load_stop_words() returns
#      for each file in config.keywords_files.
#   4. For each keyword, record it in id2kwd / kwd2id under the id
#      len(vectors), then append the result of get_norm_vector(term, model)
#      to `vectors` when that result is not None.
#   5. Record a start timestamp (the timed work itself is past the end of
#      this excerpt).
#
# NOTE(review): in step 4 the id maps are written BEFORE the
# `vec is not None` check. A term without a vector therefore still gets a
# kwd2id entry, and because len(vectors) does not advance, the next term
# reuses the same id and overwrites the id2kwd slot. Confirm whether this is
# intended.
lbl, sim = kwd2cluster_sims[kword][0] f.write("%s=>%s\n" % (kword, cluster_label(lbl))) """ Extract Clustered Synonyms """ if len(sys.argv) != 2: raise Exception("Incorrect number of arguments passed - one expected, the config file name") config = GenerateClusterSynonymsConfig(sys.argv[1]) model = Word2Vec.load(config.model_file) print("Word2Vec model loaded") keywords = set() for file in config.keywords_files: keywords.update(load_stop_words(file)) print("%i keywords loaded" % (len(keywords))) id2kwd = dict() kwd2id = dict() vectors = [] for term in keywords: id2kwd[len(vectors)] = term kwd2id[term] = len(vectors) vec = get_norm_vector(term, model) if vec is not None: vectors.append(vec) start = time.time() # don't parallelize (n_jobs = -1), doesn't seem to work
# NOTE(review): this line is a whitespace-collapsed excerpt. It opens inside
# the body of a function whose start is not visible; that body writes
# "%s|%f " payload entries to `pyld_f`, adds words to `no_sim`, writes one
# "syn=>mapped" line per sorted entry of `processed_syns` to `synonym_file`,
# and returns (missing, no_sim, processed_syns). Presumably this is
# write_most_similar_synonyms, since the script below unpacks that call into
# the same three names - confirm against the full file.
#
# The remainder is the top-level "Generate Synonym Files" script. Steps
# visible here:
#   1. Require exactly one command-line argument (the config file name) and
#      pass it to GenerateTopNSynonymsConfig; otherwise raise Exception.
#   2. Load the Word2Vec model from config.model_file.
#   3. Collect into the set `keywords` everything load_stop_words() returns
#      for each file in config.keywords_files.
#   4. Call write_most_similar_synonyms(config.top_n, keywords, model,
#      config.payload_synonyms_file, config.synonyms_file).
#   5. Print the number of processed synonyms and the elapsed seconds.
#
# NOTE(review): the last two prints use the Python 2 print-statement form
# (`print "..." % x`) while the earlier ones use the call form. Under
# Python 3 the statement form is a SyntaxError; convert them to print(...)
# if this script is meant to run on Python 3.
# NOTE(review): the loop variable `file` shadows the Python 2 builtin `file`.
pyld_f.write("%s|%f " %(kw,val)) pyld_f.write("\n") else: no_sim.add(word) #print("No matching similar terms in word2vec model for term: %s" % word) with open(synonym_file, "w+") as f: for syn in sorted(processed_syns): f.write("%s=>%s\n" % (syn, map_keyword(syn))) #Returned for analysis - do something with this if you need to investigate return missing, no_sim, processed_syns """ Generate Synonym Files """ if len(sys.argv) != 2: raise Exception("Incorrect number of arguments passed - one expected, the config file name") config = GenerateTopNSynonymsConfig(sys.argv[1]) start = time.time() model = Word2Vec.load(config.model_file) print("Word2Vec model loaded") keywords = set() for file in config.keywords_files: keywords.update(load_stop_words(file)) print("%i keywords loaded" % (len(keywords))) missing, no_sim, processed_syns = write_most_similar_synonyms(config.top_n, keywords, model, config.payload_synonyms_file, config.synonyms_file) print "%s synonyms processed" % (len(processed_syns)) end = time.time() print "Took %s seconds" % (end - start)
""" Extract Phrases """ import sys from Config.extract_keywords_config import ExtractKeywordsConfig if len(sys.argv) != 2: raise Exception( "Incorrect number of arguments passed - one expected, the config file name" ) #sys.argv[0] is this script file, sys.argv[1] should be the config file config = ExtractKeywordsConfig(sys.argv[1]) script_start = time.time() if config.stop_words_file: stop_words = load_stop_words(config.stop_words_file) print("%i stop words loaded" % len(stop_words)) else: stop_words = set() """ Load Documents """ start = time.time() files = find_files(config.processed_documents_folder, config.file_mask, True) print("%s files found in %s" % (len(files), config.processed_documents_folder)) documents = [] for i, fname in enumerate(files): with open(fname) as f: contents = f.read() documents.append(contents.split("\n")) end = time.time() print("Loading %i documents took %s seconds" % (len(files), str(end - start))) """ Extract Common Terms and Phrases """
# NOTE(review): this line is a whitespace-collapsed excerpt. Its first
# statement, the call to find_sub_phrases_to_remove(...), uses names
# (`tpl_ngram`, `valid_phrases`, `doc_freq`, `to_rem`) that are not defined in
# the visible code, so it appears to belong to code that starts before this
# excerpt.
#
# The remainder is the top-level "Extract Phrases" script. Steps visible here:
#   1. Require exactly one command-line argument (the config file name) and
#      pass it to ExtractKeywordsConfig; otherwise raise Exception.
#   2. Record script_start.
#   3. If config.stop_words_file is truthy, load the stop words with
#      load_stop_words() and print how many were loaded; otherwise use an
#      empty set.
#   4. List the input files with find_files(config.processed_documents_folder,
#      config.file_mask, True) and print how many were found.
#   5. Read every file fully and append its contents, split on "\n", to
#      `documents` (one list of lines per file).
#   6. Print how long loading took.
#
# NOTE(review): `time` is used but not imported in the visible part of this
# script - presumably it is imported earlier in the file; confirm.
# NOTE(review): the index `i` from enumerate(files) is not used in the
# visible loop body.
find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem) """ Extract Phrases """ import sys from Config.extract_keywords_config import ExtractKeywordsConfig if len(sys.argv) != 2: raise Exception("Incorrect number of arguments passed - one expected, the config file name") # sys.argv[0] is this script file, sys.argv[1] should be the config file config = ExtractKeywordsConfig(sys.argv[1]) script_start = time.time() if config.stop_words_file: stop_words = load_stop_words(config.stop_words_file) print ("%i stop words loaded" % len(stop_words)) else: stop_words = set() """ Load Documents """ start = time.time() files = find_files(config.processed_documents_folder, config.file_mask, True) print ("%s files found in %s" % (len(files), config.processed_documents_folder)) documents = [] for i, fname in enumerate(files): with open(fname) as f: contents = f.read() documents.append(contents.split("\n")) end = time.time() print ("Loading %i documents took %s seconds" % (len(files), str(end - start)))