def preprocessing(filenames, min_count=10):
    """Run the corpus preprocessing pipeline with on-disk caching.

    Reads the raw corpus, extracts sentences and words (reusing previously
    saved files when they exist), builds a frequency-filtered vocabulary and
    the word<->index lookup tables, and persists every artifact through
    ``F.save_to_file``.

    Parameters
    ----------
    filenames : object
        Namespace providing ``corpus_name``, ``output_folder``,
        ``sents_file_name``, ``words_file_name``, ``vocab_file``,
        ``updated_words_file_name``, ``w2i_file`` and ``i2w_file``.
    min_count : int, optional
        Minimum word frequency handed to ``F.getVocabulary``.  Defaults to
        10, the value previously hard-coded (the source carried a stack of
        commented-out calls with thresholds 0..400 — now a parameter).
    """
    import os

    data = F.readData(filenames.corpus_name)

    # Sentences: load the cached file when present, otherwise extract and save.
    sents_path = os.path.join(filenames.output_folder, filenames.sents_file_name)
    if not os.path.isfile(sents_path):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)

    # Words: same cache-or-compute pattern as the sentences above.
    words_path = os.path.join(filenames.output_folder, filenames.words_file_name)
    if not os.path.isfile(words_path):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)

    print("Length of text data: ", len(data))

    # Vocabulary filtered by minimum frequency; both the vocabulary and the
    # frequency-filtered token stream are persisted.
    updated_words, vocab = F.getVocabulary(words, min_count, filenames)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    # Bidirectional word/index lookup tables in vocabulary order.
    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)

    print(len(sentences), len(words))
def preprocessing(filenames):
    """Preprocess the corpus end to end (no caching variant).

    Extracts sentences and words from the raw corpus, derives a vocabulary
    with a minimum word frequency of 400 together with the word<->index
    maps, and saves every artifact via ``F.save_to_file``.
    """
    data = F.readData(filenames.corpus_name)

    sentences = F.getSentences(data)
    F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)

    print("C")
    words = F.getWords(sentences)
    F.save_to_file(filenames.words_file_name, words, filenames.output_folder)

    updated_words, vocab = F.getVocabulary(words, 400)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    # Lookup tables in both directions, keyed by vocabulary order.
    word_to_index = {token: idx for idx, token in enumerate(vocab)}
    index_to_word = {idx: token for idx, token in enumerate(vocab)}
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)

    print(len(sentences), len(words))
fields = row.split("\t") # ID, Titolo, Sottotitolo, Notizia, True=1 Fake=0 id = int(fields[0]) title = fields[1] subtitle = fields[2] corpus = fields[3] true = int(fields[4].strip()) # Devo levare i campi html dal testo della notizia corpus = re.sub("<[^>]*>", "", corpus) corpus = re.sub("(https||http)(:\/\/)[A-Za-z0-9.\-/_?=%]*", "", corpus) # per levare i link # (\w+\.)+(\w+\/\w+(\.\w+)?) news[id] = [title, subtitle, corpus, true] for idNews in news: wordsTitle = myFunctions.getWords(news[idNews][0]) wordsSubtitle = myFunctions.getWords(news[idNews][1]) wordsCorpus = myFunctions.getWords(news[idNews][2]) taggedNullTitle = myFunctions.getWordsNullTag(wordsTitle) taggedNullSubtitle = myFunctions.getWordsNullTag(wordsSubtitle) taggedNullCorpus = myFunctions.getWordsNullTag(wordsCorpus) taggedTitle = [] taggedSubtitle = [] taggedCorpus = [] '''for i in range(len(wordsTitle)): if wordsTitle[i] == "": print(wordsTitle)
# Train the decision tree on the prepared feature matrix and persist both the
# model and the vectorizer, then run a demo prediction on a sample text.
clf = tree.DecisionTreeClassifier(criterion='entropy')
with parallel_backend('threading', n_jobs=-1):
    clf.fit(x, y)
print("Finito il classificatore\n")

# Use context managers so the pickle files are actually closed
# (the original passed open(...) directly to pickle.dump and leaked the handles).
fileName = "modello.sav"
with open(fileName, 'wb') as outFile:
    pickle.dump(clf, outFile)
fileName = "vectorizer.sav"
with open(fileName, 'wb') as outFile:
    pickle.dump(vectorizer, outFile)

prova = "Republican attacks on transgendered Americans and the religious fight to keep gender a binary delineation took a turn for the bizarre this week when Virginia Republican Mark Cole filed a bill that would force schools to check the genitals of their students in order to ensure that they are using facilities reserved for their anatomical sex:Local school boards shall develop and implement policies that require every school restroom, locker room, or shower room that is designated for use by a specific gender to solely be used by individuals whose anatomical sex matches such gender designation. Such policies may also provide that a student may, upon request, be granted access, to the extent reasonable, to a single stall restroom or shower, a unisex bathroom, or controlled individual use of a restroom, locker room, or shower."
print(prova + "\n")

parole = miefunzioni.getWords(prova)
# Pair every token with the placeholder 'NULL' tag expected by getFeatures.
paroleTaggateNulle = [[parola, 'NULL'] for parola in parole]
print(parole)

# Renamed from `input`, which shadowed the builtin of the same name.
featureVectors = [getFeatures(paroleTaggateNulle, indice)
                  for indice in range(len(paroleTaggateNulle))]
xTest = vectorizer.transform(featureVectors)
# list(...) so the (word, prediction) pairs are printed -- a bare zip object
# only prints its repr under Python 3.
print(list(zip(parole, clf.predict(xTest))))

# To evaluate how the model behaves:
#scores = cross_val_score(clf, X, Y, cv=5)
#print scores
# Script-style preprocessing: the argv flags 's' and 'w' select loading the
# cached sentences/words files instead of recomputing them from the corpus.
data = ""
sentences = []
words = []

if 's' in F.sys.argv:
    print("B")
    sentences = F.load_to_file(sents_file_name)
else:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)

if 'w' in F.sys.argv:
    print("D")
    words = F.load_to_file(words_file_name)
else:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)

updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)

# Word <-> index lookup tables built from the vocabulary order.
word_to_index = {}
index_to_word = {}
for position, token in enumerate(vocab):
    word_to_index[token] = position
    index_to_word[position] = token
F.save_to_file(w2i_file, word_to_index)