from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_model(docs_dir, model_file):
    # Read the documents, tagging each one with its filename so its
    # vector can be looked up by tag after training.
    docs = utils.read_docs(lambda text, f: TaggedDocument(utils.cut_and_remove_stopwords(text), [f]))
    # PV-DBOW (dm=0), 128-dimensional vectors, keeping every token (min_count=0).
    model = Doc2Vec(docs, dm=0, vector_size=128, min_count=0, workers=4, epochs=10)
    print("saving model to " + model_file)
    model.save(model_file)
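
# A minimal usage sketch, not part of the original source: assuming gensim 4.x,
# where trained document vectors live under model.dv, this shows how a model
# saved by train_model() might be queried. The function name
# find_similar_docs and the topn default are illustrative choices.
def find_similar_docs(model_file, query_text, topn=5):
    model = Doc2Vec.load(model_file)
    # Tokenize the query the same way the training documents were tokenized,
    # infer a vector for it, and rank training docs by cosine similarity.
    tokens = utils.cut_and_remove_stopwords(query_text)
    vector = model.infer_vector(tokens)
    return model.dv.most_similar([vector], topn=topn)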
def main():
    # If graph.pdf exists from the last run, delete it.
    if os.path.exists("graph.pdf"):
        os.remove("graph.pdf")
    docs = utils.read_docs()
    recommend = RecommenderAlgorithm(docs)
    recommend.createLanguageList()
    recommend.createTopicList()
    recommend.printToFile('indexByTopics.json')
    recommend.printToFile('indexByLanguages.json')
    recommend.userInterface()
optimizer = "adam"
pad_token_src = 3
docVector = []
doc_names = []
#wt = torch.from_numpy(np.zeros((n_words, src_dim)))

# Read the source files.
data = []
for root, dirn, files in os.walk(load_dir):
    for f in files:
        if f.endswith(".txt"):
            print(f)
            # TODO: replace this tokenization method with BPE.
            src = read_docs(os.path.join(root, f))
            doc_names.append(os.path.join(root, f))
            # Keep at most token_len tokens per document
            # (slicing is safe even when src is shorter).
            data.append(src[:token_len])
print("\n")

if model_name not in ('bert', 'scibert'):
    src, word2idx, idx2word = read_data(data)
    n_words = len(word2idx)
    vocab_size = 30004  # fixed vocabulary size (an earlier vocab_size = n_words was immediately overwritten)
    #preTrain_model = load_Wikiword2vecModel(cache_path)
    #wt = get_embeddingWeights(preTrain_model, n_words, word2idx, src_dim)
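
# A hedged sketch, not part of the original source, of what the commented-out
# get_embeddingWeights() call above presumably does: build an embedding weight
# matrix from a pretrained word2vec model. It assumes the pretrained model is
# a gensim KeyedVectors instance; each known word's vector is copied into row
# word2idx[word], and out-of-vocabulary words keep the zero row. The name
# build_embedding_weights is hypothetical.
import numpy as np
import torch

def build_embedding_weights(pretrained_kv, n_words, word2idx, src_dim):
    wt = np.zeros((n_words, src_dim), dtype=np.float32)
    for word, idx in word2idx.items():
        if word in pretrained_kv:
            wt[idx] = pretrained_kv[word]
    return torch.from_numpy(wt)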