Code example #1
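The snippet below begins mid-script, so none of its setup is shown. A minimal sketch of the assumed preamble follows; the directory path and the NLTK punkt tokenizer are guesses, and convert_pdf_to_txt, convert_docx_to_txt, words_to_phrases, and doc_to_sentences are project-local helpers whose definitions never appear in these examples:

import os
from os.path import splitext

import nltk.data

dir_path = "corpus/"  # placeholder input directory
files = os.listdir(dir_path)

# Sentence splitter assumed by doc_to_sentences below.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

sentences = []         # token lists accumulated for word2vec training
sentences_string = ""  # human-readable "sentence | author | file | month" log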
    for file in files:
        file_path = os.path.join(dir_path, file)  # safer than bare string concatenation
        file_name, file_extension = splitext(file_path)
        doc = ""

        if file_extension == ".pdf":
            doc = convert_pdf_to_txt(file_path)
        elif file_extension == ".docx":
            doc = convert_docx_to_txt(file_path)
        else:
            continue

        if doc != "":
            # Normalize to plain ASCII, merge multi-word phrases, lowercase.
            doc = doc.decode("utf8").encode("ascii", "ignore").decode("ascii")
            doc = words_to_phrases(doc)
            doc = doc.lower()
            # Tokenize once and reuse the result rather than calling
            # doc_to_sentences twice on the same document.
            doc_sentence = doc_to_sentences(doc, tokenizer, remove_stopwords=True)
            sentences += doc_sentence
            print(file_path)
            # File names apparently encode metadata: "<author>_<month>_<year>.<ext>".
            file1 = file.split("_")
            month = file1[2].split(".")[0] + "-" + file1[1]
            author = file1[0]
            for item in doc_sentence:
                item = " ".join(item)
                sentences_string += ("\n" + item + " | " + author + " | "
                                     + file.decode("utf8").encode("ascii", "ignore").decode("ascii")
                                     + " | " + month)


print(sentences_string.count("\n"))
with open("sentences.txt", "w") as text_file:
    text_file.write("%s" % sentences_string)
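doc_to_sentences itself is never shown in these snippets. A plausible reconstruction in the common NLTK style, splitting a document into sentences and each sentence into a cleaned word list; every detail here is an assumption, not the author's code:

import re
from nltk.corpus import stopwords

def doc_to_sentences(doc, tokenizer, remove_stopwords=False):
    # Split the document into sentences, then each sentence into words,
    # keeping only alphabetic tokens.
    result = []
    for raw in tokenizer.tokenize(doc.strip()):
        words = re.sub("[^a-zA-Z]", " ", raw).lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        if words:
            result.append(words)
    return result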
Code example #2
    for file in files:  # loop header reconstructed; compare code example #1
        file_path = os.path.join(dir_path, file)
        file_name, file_extension = splitext(file_path)
        doc = ""
        try:
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            else:
                continue
        except Exception:
            # Skip files that fail to convert instead of aborting the whole run.
            continue

        if doc != "":
            doc = doc.decode("utf8")
            doc = words_to_phrases(doc)
            doc = doc.lower()

            sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)


print len(sentences)
# sentences_string = ""
# for item in sentences:
#     item = " ".join(item)
#     sentences_string += "\n"+item
#
#
# with open("sentences.txt", "w") as text_file:
#     text_file.write("%s" % sentences_string)
#print sentences[0]


Code example #3
for file in files:
    doc = ""
    try:
        # Loop header and the plain-text branch are reconstructed guesses;
        # only this path is live, since the pdf/docx branches are commented out.
        if str(file).__contains__(".txt"):
            with open(file) as fp:
                doc = fp.read()
        # elif str(file).__contains__(".pdf"):
        #     doc = convert_pdf_to_txt(file)
        # elif str(file).__contains__(".docx"):
        #     doc = convert_docx_to_txt(file)
        else:
            continue
    except Exception:
        # Skip unreadable files instead of crashing the run.
        continue

    if doc != "":
        doc = doc.decode("utf8")
        doc = words_to_phrases(doc)
        doc = doc.lower()

        sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

print len(sentences)
with open("csc791_sentences.txt", "w") as text_file:
    text_file.write("%s" % sentences)  # note: this writes the Python repr of the token lists

# Set values for various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 10  # Minimum word count
num_workers = 2  # Number of threads to run in parallel
context = 10  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
print "Training model..."
# Keyword arguments inferred from the parameter block above (pre-gensim-4.0 names).
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
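The snippet ends mid-call; in this era of gensim the usual follow-up freezes the trained vectors and saves the model. A minimal sketch, assuming the pre-4.0 API that the call above already implies (the model name is a placeholder, not from the source):

# Discard training state and L2-normalize the vectors to save memory.
model.init_sims(replace=True)

# Persist the model for later reuse; the file name is a placeholder.
model.save("300features_10minwords_10context")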