Code Example #1
def main():
    docs = utils.read_docs()
    recommend = RecommenderAlgorithm(docs)
    # Build the language and topic indexes over the document set.
    recommend.createLanguageList()
    recommend.createTopicList()
    # Dump both indexes to JSON files, then start the interactive UI.
    recommend.printToFile('indexByTopics.json')
    recommend.printToFile('indexByLanguages.json')
    recommend.userInterface()
Code Example #2
File: doc2vec_run.py Project: soon14/wiz-classif
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_model(docs_dir, model_file):
    # Tokenize each document, strip stopwords, and tag it with its filename.
    docs = utils.read_docs(lambda text, fname: TaggedDocument(utils.cut_and_remove_stopwords(text), [fname]))

    # Train a PV-DBOW model (dm=0) with 128-dimensional document vectors.
    model = Doc2Vec(docs, dm=0, vector_size=128, min_count=0, workers=4, epochs=10)

    print("saving model to " + model_file)
    model.save(model_file)
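
A minimal usage sketch (the directory and model paths below are illustrative assumptions, not taken from the project):

# Train on a corpus directory and persist the model; the saved file can
# later be reloaded with gensim's Doc2Vec.load().
train_model("data/wiz_docs", "wiz_doc2vec.model")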
Code Example #3
import os

def main():
    # If graph.pdf exists from the last run, delete it.
    if os.path.exists("graph.pdf"):
        os.remove("graph.pdf")
    docs = utils.read_docs()
    recommend = RecommenderAlgorithm(docs)
    # Build the language and topic indexes over the document set.
    recommend.createLanguageList()
    recommend.createTopicList()
    # Dump both indexes to JSON files, then start the interactive UI.
    recommend.printToFile('indexByTopics.json')
    recommend.printToFile('indexByLanguages.json')
    recommend.userInterface()
Code Example #4
optimizer = "adam"
pad_token_src = 3

docVector = []
doc_names = []
#wt = torch.from_numpy(np.zeros((n_words, src_dim)))

# Read the source files
data = []
for root, _dirs, files in os.walk(load_dir):
    for f in files:
        if f.endswith(".txt"):
            print(f)
            # TODO: replace the hand-rolled tokenization with BPE
            src = read_docs(os.path.join(root, f))
            doc_names.append(os.path.join(root, f))
            # Truncate each document to at most token_len tokens.
            data.append(src[:token_len])
            print("\n")

# Build a local vocabulary unless a pretrained BERT-style tokenizer is used.
if model_name not in ('bert', 'scibert'):
    src, word2idx, idx2word = read_data(data)
    n_words = len(word2idx)
    vocab_size = 30004  # fixed vocabulary size; overrides len(word2idx)
#preTrain_model = load_Wikiword2vecModel(cache_path)
#wt = get_embeddingWeights(preTrain_model, n_words, word2idx, src_dim)
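
For reference, a minimal sketch of the embedding-weight construction that the commented-out lines above hint at, assuming a gensim KeyedVectors model; the helper name and the zero-initialization of out-of-vocabulary rows are assumptions, not the project's actual implementation:

import numpy as np
import torch
from gensim.models import KeyedVectors

def get_embedding_weights(pretrained, n_words, word2idx, src_dim):
    # Copy each known word's pretrained vector into an (n_words, src_dim)
    # matrix; words missing from the pretrained model keep zero vectors.
    wt = np.zeros((n_words, src_dim), dtype=np.float32)
    for word, idx in word2idx.items():
        if word in pretrained:
            wt[idx] = pretrained[word]
    return torch.from_numpy(wt)

# Hypothetical usage, mirroring the commented-out calls above:
# pretrained = KeyedVectors.load(cache_path)
# wt = get_embedding_weights(pretrained, n_words, word2idx, src_dim)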