def preprocessing(filenames, min_count=10):
    """Build (or load cached) sentence, word and vocabulary artifacts for a corpus.

    Reads the raw corpus named by ``filenames.corpus_name`` via the ``F``
    helper module, splits it into sentences and words (reusing previously
    saved files in ``filenames.output_folder`` when present), builds a
    vocabulary, and saves the vocabulary plus word<->index mappings.

    Args:
        filenames: config object providing corpus/output file names
            (``corpus_name``, ``output_folder``, ``sents_file_name``,
            ``words_file_name``, ``vocab_file``, ``updated_words_file_name``,
            ``w2i_file``, ``i2w_file``).
        min_count: frequency threshold passed to ``F.getVocabulary``.
            Defaults to 10, the value previously hard-coded here.

    Returns:
        None. All results are persisted via ``F.save_to_file``.
    """
    import os  # local import kept: module-level imports are outside this view

    data = F.readData(filenames.corpus_name)

    # Sentences: compute and cache, or load the cached file if it exists.
    sents_path = os.path.join(filenames.output_folder, filenames.sents_file_name)
    if not os.path.isfile(sents_path):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)

    # Words: same compute-or-load caching scheme.
    words_path = os.path.join(filenames.output_folder, filenames.words_file_name)
    if not os.path.isfile(words_path):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)

    print("Length of text data: ", len(data))

    # min_count was previously tuned by hand (commented-out calls with
    # 400, 300, ..., 0); it is now an explicit parameter defaulting to 10.
    updated_words, vocab = F.getVocabulary(words, min_count, filenames)

    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    # Bidirectional word <-> index mappings over the vocabulary.
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    index_to_word = {idx: word for idx, word in enumerate(vocab)}

    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
Exemple #2
0
def preprocessing(filenames):
    """Compute and persist sentences, words and a vocabulary for a corpus.

    Unlike the cached variant, this always recomputes every artifact from
    ``filenames.corpus_name`` and saves it into ``filenames.output_folder``
    via ``F.save_to_file``. The vocabulary is built with a fixed frequency
    threshold of 400. Returns None; all results are written to disk.
    """
    # Sentence extraction.
    data = F.readData(filenames.corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)

    print("C")  # progress marker preserved from the original implementation
    # Word extraction.
    words = F.getWords(sentences)
    F.save_to_file(filenames.words_file_name, words, filenames.output_folder)

    # Vocabulary with hard-coded min-count 400.
    updated_words, vocab = F.getVocabulary(words, 400)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    # Word <-> index lookup tables.
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    index_to_word = {idx: word for idx, word in enumerate(vocab)}
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)

    print(len(sentences), len(words))
     fields = row.split("\t")
     # ID, Titolo, Sottotitolo, Notizia, True=1 Fake=0
     id = int(fields[0])
     title = fields[1]
     subtitle = fields[2]
     corpus = fields[3]
     true = int(fields[4].strip())
     # Devo levare i campi html dal testo della notizia
     corpus = re.sub("<[^>]*>", "", corpus)
     corpus = re.sub("(https||http)(:\/\/)[A-Za-z0-9.\-/_?=%]*", "", corpus)
     # per levare i link
     # (\w+\.)+(\w+\/\w+(\.\w+)?)
     news[id] = [title, subtitle, corpus, true]
     
 for idNews in news:
     wordsTitle = myFunctions.getWords(news[idNews][0])
     wordsSubtitle = myFunctions.getWords(news[idNews][1])
     wordsCorpus = myFunctions.getWords(news[idNews][2])
     
     taggedNullTitle = myFunctions.getWordsNullTag(wordsTitle)
     taggedNullSubtitle = myFunctions.getWordsNullTag(wordsSubtitle)
     taggedNullCorpus = myFunctions.getWordsNullTag(wordsCorpus)
     
     taggedTitle = []
     taggedSubtitle = []
     taggedCorpus = []
     
     '''for i in range(len(wordsTitle)):
         if wordsTitle[i] == "":
             print(wordsTitle)
     
Exemple #4
0
    # Train a decision-tree classifier on (x, y); entropy = information-gain splits.
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    with parallel_backend('threading', n_jobs=-1):  # use all cores during fit
        clf.fit(x, y)

    print("Finito il classificatore\n")

    # Persist the trained model and the fitted vectorizer for later reuse.
    # NOTE(review): pickle files are only safe to load from trusted sources.
    fileName = "modello.sav"
    pickle.dump(clf, open(fileName, 'wb'))

    fileName = "vectorizer.sav"
    pickle.dump(vectorizer, open(fileName, 'wb'))

    # Smoke-test the model on a hard-coded sample paragraph.
    prova = "Republican attacks on transgendered Americans and the religious fight to keep gender a binary delineation took a turn for the bizarre this week when Virginia Republican Mark Cole filed a bill that would force schools to check the genitals of their students in order to ensure that they are using facilities reserved for their anatomical sex:Local school boards shall develop and implement policies that require every school restroom, locker room, or shower room that is designated for use by a specific gender to solely be used by individuals whose anatomical sex matches such gender designation. Such policies may also provide that a student may, upon request, be granted access, to the extent reasonable, to a single stall restroom or shower, a unisex bathroom, or controlled individual use of a restroom, locker room, or shower."
    print(prova + "\n")
    parole = miefunzioni.getWords(prova)
    # Tag every word with a placeholder 'NULL' label before feature extraction.
    paroleTaggateNulle = []
    for parola in parole:
        #print(parola)
        paroleTaggateNulle.append([parola, 'NULL'])
    print(parole)
    # Build one feature dict per word position, then vectorize and predict.
    # NOTE(review): `input` shadows the builtin; in Python 3 this prints a
    # zip object, not the pairs — presumably meant list(zip(...)). Confirm.
    input = []
    for indice in range(len(paroleTaggateNulle)):
        input.append(getFeatures(paroleTaggateNulle, indice))
    xTest = vectorizer.transform(input)
    print(zip(parole, clf.predict(xTest)))

    # Cross-validation left disabled by the original author.
    # Per vedere come si comporta il modello (to see how the model performs)
    #scores = cross_val_score(clf, X, Y, cv=5)
    #print scores
# Script-style preprocessing pipeline: build or load sentences/words, then
# derive a vocabulary and word<->index maps, saving each artifact via F.
# NOTE(review): F, corpus_name, *_file_name, vocab_file, w2i_file are defined
# outside this view — presumably in an earlier import/config section; confirm.
data = ""
sentences = []
words = []
# 's' on the command line means "skip sentence extraction, load cached file".
if 's' not in F.sys.argv:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)
else:
    print("B")
    sentences = F.load_to_file(sents_file_name)

# 'w' on the command line means "skip word extraction, load cached file".
if 'w' not in F.sys.argv:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)
else:
    print("D")
    words = F.load_to_file(words_file_name)

# Vocabulary with frequency threshold 400, then persist it.
updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)

# Word <-> index mappings; only word_to_index is saved here.
# NOTE(review): unlike the function versions above, index_to_word is built
# but never saved — the file may be truncated at this point; confirm.
word_to_index = {}
index_to_word = {}
for k, v in enumerate(vocab):
    word_to_index[v] = k
    index_to_word[k] = v
F.save_to_file(w2i_file, word_to_index)