Beispiel #1
0
def read_and_train_doc2vec(root_dir, fileids, output_file='', options=None):
    """Read corpus files and train/save a doc2vec model on them.

    One ``TaggedDocument`` is built per file (tagged with its fileid) and the
    list is handed to ``train_and_save_doc2vec``. If reading or training
    raises ``UnicodeDecodeError``, the encoding of every file is detected
    with ``chardet``, installed on the reader, and the whole step is retried
    once; a failure during the retry propagates to the caller.

    Args:
        root_dir: Directory that contains the corpus files.
        fileids: A single file id or a list of file ids, relative to
            ``root_dir``. Byte strings are decoded as UTF-8.
        output_file: Base name for the saved model. When empty, the file ids
            joined with ``-`` are used.
        options: Training options forwarded to ``options_to_string`` and
            ``train_and_save_doc2vec``. ``None`` means an empty dict.

    Returns:
        None.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    # A fresh dict per call avoids sharing one mutable default between calls.
    options = {} if options is None else options

    fileids = fileids if isinstance(fileids, list) else [fileids]
    # unicode(x, 'utf8') raises TypeError for values that are already unicode,
    # so only byte strings are decoded.
    fileids = [
        f if isinstance(f, unicode) else unicode(f, 'utf8')
        for f in fileids
    ]
    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file,
                                       options_to_string(options))
    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)

    def build_and_train():
        # Shared by the first attempt and the retry so both stay in sync.
        docs = [
            TaggedDocument(reader.words(fileid), [fileid])
            for fileid in fileids
        ]
        train_and_save_doc2vec(docs, output_file, options)

    try:
        build_and_train()
    except UnicodeDecodeError:
        file_encodings = {}
        for fileid in fileids:
            # Read raw bytes for detection and close the handle right away.
            with open(os.path.join(root_dir, fileid), 'rb') as handle:
                file_content = handle.read()
            file_encodings[fileid] = chardet.detect(file_content)['encoding']
        # NOTE(review): relies on the reader accepting a per-file encoding
        # mapping through its private ``_encoding`` attribute -- confirm
        # against the corpus reader implementation in use.
        reader._encoding = file_encodings
        build_and_train()
Beispiel #2
0
def read_and_train(root_dir, fileids, output_file='', options=None):
    """Read corpus files as sentences and train/save a model on them.

    The sentences of all files are obtained from the corpus reader and handed
    to ``train_and_save``. If reading or training raises
    ``UnicodeDecodeError``, the encoding of every file is detected with
    ``chardet``, installed on the reader, and the step is retried once; a
    failure during the retry propagates to the caller.

    Args:
        root_dir: Directory that contains the corpus files.
        fileids: A single file id or a list of file ids, relative to
            ``root_dir``. Byte strings are decoded as UTF-8.
        output_file: Base name for the saved model. When empty, the file ids
            joined with ``-`` are used.
        options: Training options forwarded to ``options_to_string`` and
            ``train_and_save``. ``None`` means an empty dict.

    Returns:
        None.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    # A fresh dict per call avoids sharing one mutable default between calls.
    options = {} if options is None else options

    fileids = fileids if isinstance(fileids, list) else [fileids]
    # unicode(x, 'utf8') raises TypeError for values that are already unicode,
    # so only byte strings are decoded.
    fileids = [
        f if isinstance(f, unicode) else unicode(f, 'utf8')
        for f in fileids
    ]
    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file,
                                       options_to_string(options))
    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)
    try:
        sents = reader.sents()
        # Single-argument parenthesised print emits the same text whether it
        # is parsed as a Python 2 statement or a Python 3 call.
        print(fileids)
        train_and_save(sents, output_file, options)
    except UnicodeDecodeError:
        print("here")
        file_encodings = {}
        for fileid in fileids:
            # Read raw bytes for detection and close the handle right away.
            with open(os.path.join(root_dir, fileid), 'rb') as handle:
                file_content = handle.read()
            file_encodings[fileid] = chardet.detect(file_content)['encoding']
        # NOTE(review): relies on the reader accepting a per-file encoding
        # mapping through its private ``_encoding`` attribute -- confirm
        # against the corpus reader implementation in use.
        reader._encoding = file_encodings
        sents = reader.sents()
        train_and_save(sents, output_file, options)