def createTrainingVectors(tokenized_texts_dict):
    """
        Given the filenames and their contents, this methods creates the training 
        vectors by creating a unique list of all words together in the training
        set
    """
    print("Creating vectors for training data")

    unique_words = []
    for filename, text in tokenized_texts_dict.items():
        # print("Reading {0} and adding to unique word list".format(filename))
        unique_words.extend(word_tokenize(text))

    unique_words = set(unique_words)

    # Creating the initial vector with counts 0 for all training sets
    zero_vector = OrderedDict(zip(unique_words, [0] * len(unique_words)))
    print("Creating the zero vector")

    # For each training file, create an OrderedDict containing its word counts (together with zero counts),
    # and store it in a dict, indexed by its corresponding filename
    vectors = {}
    for filename, text in tokenized_texts_dict.items():
        current_vector = zero_vector.copy()
        current_vector.update(Counter(word_tokenize(text)))
        vectors[filename] = current_vector

    return vectors, zero_vector
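A minimal usage sketch for the function above (not from the original source): it assumes the NLTK punkt data is installed and that createTrainingVectors is in scope together with the OrderedDict/Counter/word_tokenize imports it relies on; the filenames and texts are invented for illustration.

from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize

# Invented miniature corpus: filename -> raw text
sample_texts = {
    "doc1.txt": "the cat sat on the mat",
    "doc2.txt": "the dog chased the cat",
}

vectors, zero_vector = createTrainingVectors(sample_texts)
print(vectors["doc1.txt"]["cat"])   # 1: "cat" occurs once in doc1.txt
print(zero_vector["dog"])           # 0: the shared zero template stays untouched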
def cleaned_bag_of_words_dataset(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None,
                                 length=False, number_in_tweet=False, words_present=()):
    if stemming:
        stemmer = SnowballStemmer("english")
        tweets = [" ".join([stemmer.stem(word) for word in word_tokenize(data_point[2].lower().decode("utf8"))]) for data_point in data_matrix]
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]
        
    if TFIDF:
        vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    else:
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    
    dataset = vectorizer.fit_transform(tweets).toarray()
    
    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)
     
    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            number_list = list_of_ints_from_string(data_point[2])
            filtered_number_list = [number for number in number_list if abs(number) < 10]
            if len(filtered_number_list) == 0:
                numbers.append([0])
            else:
                numbers.append([np.mean(filtered_number_list)])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array([[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)
        
    return dataset
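A hedged usage sketch, not part of the original code: it assumes the function above is in scope with its numpy, scikit-learn and NLTK imports, and that each row of data_matrix carries the tweet text at index 2, as the function body expects. The rows here are invented.

data_matrix = [
    ("id1", "user_a", "I love this phone"),
    ("id2", "user_b", "battery life is terrible and the screen is bad"),
]

# Unigram + bigram TF-IDF features for the two invented tweets
X = cleaned_bag_of_words_dataset(data_matrix, TFIDF=True, ngram_range=(1, 2))
print(X.shape)  # (2, number_of_unigram_and_bigram_features)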
Example #3
def tokenize_sentences(filename):
	file_dir = docs_dir + str(filename)
	f = open(file_dir, 'r')

	root = ET.parse(f).getroot()
	tags = root.getiterator('str')

	# read the relevant tags
	title_string = ''
	desc_string = ''
	for tag in tags:
		if tag.get('name')  == 'Title' :
			title_string = filter(lambda x: x in string.printable, tag.text.lower().strip())

		elif tag.get('name') == 'Abstract':
			desc_string = filter(lambda x: x in string.printable, tag.text.lower().strip().replace('relevant documents will describe', ''))

	f.close()

	sentences = sent_tokenize(title_string)
	title_words = []
	for s in sentences:
		title_words = title_words + word_tokenize(s)

	sentences = sent_tokenize(desc_string)
	desc_words = []
	for s in sentences:
		desc_words = desc_words + word_tokenize(s)

	
	return (title_words, desc_words)
Example #4
def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove previous output file
        os.remove(output_file)
    except OSError:
        pass
    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()

    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []

    raw_tokens = []

    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in list(set(raw_tokens)):
        additional_tokens.extend(helper.get_similar_words(token))
        

    title_tokens = helper.remove_stop_words(helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(helper.filter_invalid_characters(description_tokens))

    # tight results are results which favour high precision. We use this as a proxy for true positive
    tight_results = execute_query(title_tokens, description_tokens, [], inverted_index, meta_data)
    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    global top_cited_by

    # Get top UPC, IPC, family members and cited by from our true positive proxy results
    # This helps us determine which documents are more similar to the original top results
    # when we add in the additional similar words
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results, meta_data['family_members'], 20)
    top_cited_by = get_top_members(tight_results, meta_data['cited_by'], 20)
    
    # query expansion 
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)
    
    # synonyms, hypernyms
    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))

    results = execute_query(title_tokens, description_tokens, additional_tokens, inverted_index, meta_data)

    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])
Example #5
def max_similarity(context_sentence, ambiguous_word, option="path", 
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible 
    synsets of all words in the context sentence and the possible synsets of the 
    ambiguous words (see http://goo.gl/XMq2BI):
    {argmax}_{synset(a)}(\sum_{i}^{n}{{max}_{synset(i)}(sim(i,a))}
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue 
        result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
                        for j in context_sentence)
    
    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)
    if not result:
        return None
    if best:
        return result[0][1]
    return result
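A short, hedged usage sketch: it assumes this function is used inside the module it was taken from, so its lemmatize and sim helpers, the wn WordNet handle and the NLTK WordNet data are all available; the sentence is invented.

sentence = "I went to the bank to deposit my money"
best_sense = max_similarity(sentence, "bank", option="path", pos="n")
print(best_sense)               # highest-scoring WordNet synset for "bank" in this context
print(best_sense.definition())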
Example #6
def main():
    # Load up txt files
    speech_file = open('trump-speeches/speeches.txt').read()
    tweets = json.load(open('trump_tweets.json'))
    tweet_list = []
    for tweet in tweets:
        tweet_list.append(tweet['text'])
    tweet_list = ' '.join(tweet_list)

    # Tokenize
    logging.info('Formatting training text')
    speech_token = word_tokenize(speech_file)
    tweet_token = word_tokenize(tweet_list)

    # Train trigram models
    logging.info('Setting up models')
    speech_gram, speech_format = ngram(speech_token, 3)
    tweet_gram, tweet_format = ngram(tweet_token, 3)

    # Generate responses
    cont = True
    while cont:
        response = input("Hello sir, what can I Trumpinate for you?: ")
        num_words = input("And how many words should I write?: ")

        # Print Phrases
        gen_phrase(speech_gram, int(num_words), starter_word=[response])
        print('')
        gen_phrase(tweet_gram, int(num_words), starter_word=[response])
        more = input("Would you like to generate more? (Yes, No): ")
        if more != 'Yes':
            cont = False
def get_cluster(s1, s2, dataset):
  """
    Return "cluster" (i.e. video or picture name) that the sentences came from
  """

  if dataset == 'FLICKR':
    data_reverse = flickr_reverse
    sent_1 = ' '.join(word_tokenize(s1))
    sent_2 = ' '.join(word_tokenize(s2))
  else:
    data_reverse = msr_reverse
    sent_1 = s1
    sent_2 = s2

  if sent_1 not in data_reverse:
    return None
  if sent_2 not in data_reverse:
    return None


  candidates_1 = set(data_reverse[sent_1])
  candidates_2 = set(data_reverse[sent_2])
  
  if len(candidates_1 & candidates_2) > 0:
    return list(candidates_1 & candidates_2)[0]
def getBigramBeginWithNotCount(sent):
    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
                     "bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
                     "drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
                     "awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
                         "complain", "complained", "hated", "negative"]
    bigramPositiveCount = 0
    '''
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

    for bigram in nltk.bigrams(word_tokenize(sent)):
        if bigram[0].lower() == "not" and bigram[1].lower() in negative_keywords:
            print sent
            print bigram
            print unigram_tagger.tag(word_tokenize(sent))
            bigramNotCount += 1
    '''
    tokens = word_tokenize(sent)
    for i, word in enumerate(tokens):
        if word.lower() == "not":
            if i + 1 < len(tokens) and tokens[i + 1].lower() in negative_keywords:      # e.g. NOT bad
                bigramPositiveCount += 1
            elif i + 2 < len(tokens) and tokens[i + 2].lower() in negative_keywords:    # e.g. NOT too bad
                bigramPositiveCount += 1
            else:                                                                       # e.g. NOT good
                bigramPositiveCount -= 1
    return bigramPositiveCount
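A quick check of the corrected if/elif logic above, using invented sentences and assuming NLTK punkt data is installed:

print(getBigramBeginWithNotCount("The screen is not bad at all"))   # 1  -> "not" followed by a negative keyword
print(getBigramBeginWithNotCount("The battery is not good"))        # -1 -> "not" followed by a non-negative word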
Example #9
def word_standardize(sentences): 	
    tokens = []
    sentences_st = []

    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
	
    words = tokens
    
    st = LancasterStemmer()

    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)

    return st_words, sent_result
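A hedged usage sketch for word_standardize, assuming the NLTK punkt and stopwords data are installed and the LancasterStemmer/stopwords/word_tokenize imports are in scope; the sentences are invented.

from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

sentences = ["The cats are running quickly!", "A dog barked at the runners."]
stemmed_words, cleaned_sentences = word_standardize(sentences)
print(stemmed_words)       # Lancaster-stemmed, lowercased tokens with stopwords/punctuation removed
print(cleaned_sentences)   # the same filtering applied sentence by sentence, without stemming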
def test(testAccents, testNoAccents, dictnoAccents):
    
    count = 0
    correct = 0
    notWord = []
    result = []
    incorrect = {}
    wordCount = 0
    nonWordCount = 0
    for i in range(len(testAccents)):
       
       
        sent = ""
        sentenceAccents = testAccents[i]
        sentenceNoAccents = testNoAccents[i]
            
        tokensAccents = word_tokenize(sentenceAccents)
        tokensNoAccents = word_tokenize(sentenceNoAccents)
        
        if len(tokensAccents) == len(tokensNoAccents):
            for j in range(len(tokensAccents)):
                tA = tokensAccents[j]
                tNA = tokensNoAccents[j]
                if tNA not in punctuation and not tNA.isdigit():
                    wordCount +=1
                    if tNA in dictnoAccents.keys():
                        
                        newToken = max(dictnoAccents[tNA], key=dictnoAccents[tNA].get)
                        #print(newToken)
                        #print("YES")
                    else:
                        newToken = tNA
                    if newToken == tA:
                        correct +=1
                    else:
                        incorrect[newToken] = tA
                       # print(newToken)
                       # print(tA)
                    count +=1
                    
                    #print("HI")
                    if j != 0:
                        newToken = " " + newToken
                else:   
                    
                    nonWordCount  +=1
                   
                    
                    notWord.append(tNA)
                    newToken = tNA
                sent = sent + newToken
       
            result.append(sent)
      
    print("Le nombre de mot dans le corpus: " + str(wordCount) )
    print("Le nombre de ponctuation et de nombres dans le corpus: " + str(nonWordCount))
    print("Nombre au total de changements/non changements possibles " + str(count ))
    print("Nombre au total de decisions correctes " + str(correct))
    print("Accuracy: " + str(correct/count) )
    return([incorrect,correct/count, wordCount, nonWordCount])
Example #11
def load_anssel_samples(qtext, atexts):
    samples = []
    qtext = word_tokenize(qtext)
    for atext in atexts:
        atext = word_tokenize(atext)
        samples.append({'qtext': ' '.join(qtext), 'label': 0, 'atext': ' '.join(atext)})
    return samples
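A minimal usage sketch (invented question and answers), assuming word_tokenize is imported as in the module this snippet comes from:

question = "What is the capital of France?"
answers = ["Paris is the capital of France.", "France borders Spain."]
print(load_anssel_samples(question, answers)[0])
# {'qtext': 'What is the capital of France ?', 'label': 0, 'atext': 'Paris is the capital of France .'}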
Example #12
def load_data(loc='./data/'):
    """
    Load MSRP dataset
    """
    trainloc = os.path.join(loc, 'msr_paraphrase_train.txt')
    testloc = os.path.join(loc, 'msr_paraphrase_test.txt')

    trainA, trainB, testA, testB = [],[],[],[]
    trainS, devS, testS = [],[],[]

    f = open(trainloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        trainA.append(' '.join(word_tokenize(text[3])))
        trainB.append(' '.join(word_tokenize(text[4])))
        trainS.append(text[0])
    f.close()
    f = open(testloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        testA.append(' '.join(word_tokenize(text[3])))
        testB.append(' '.join(word_tokenize(text[4])))
        testS.append(text[0])
    f.close()

    trainS = [int(s) for s in trainS[1:]]
    testS = [int(s) for s in testS[1:]]

    return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]
Example #13
def write_anotations_to_file(lst_annotation, file_name):

    with codecs.open(file_name, 'w', 'utf-8') as f:
        for annotation in lst_annotation:
            annotation_full_text = annotation.text

            car_name = preprocessor_text(annotation.name)

            annotation_start = annotation_full_text.find(car_name)
            annotation_end = annotation.start + len(car_name)

            full_text_before_annotation = preprocessor_text(annotation_full_text[:annotation_start].strip())

            before_tokens = word_tokenize(full_text_before_annotation)

            for token in before_tokens:
                f.write( token + u' ' + u'O' + u'\n' )

            annotation_tokens = word_tokenize(car_name)
            for idx, token in enumerate(annotation_tokens):
                if idx == 0:
                    label = u'B'
                else:
                    label = u'I'
                f.write( token + u' ' + label + u'\n' )

            full_text_after_annotation =  preprocessor_text(annotation_full_text[annotation_end:]).strip()

            after_tokens = word_tokenize(full_text_after_annotation)

            for token in after_tokens:
                f.write( token + u' ' + u'O' + '\n' )
            f.write( u'\n' )
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("","")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)

    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
Example #15
def load_ace_file(textfile, fmt):
    print '  - %s' % os.path.split(textfile)[1]
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()
    
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s,e,typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree('NE', text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract words from positive and negative sentences.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only the bigrams where both words are longer than 3 characters
    # and neither word appears in the exclusion list `ex`
    bigrams_best = [bigram for bigram in bigrams_all
            if len(bigram[0]) > 3 and len(bigram[1]) > 3
            and bigram[0] not in ex and bigram[1] not in ex]


    return bigrams_best
Example #17
    def __next__(self):
        if self.sentence == False: # we will treat one document = one hansard statement
            res = cursor.fetchone()
            if res == None:
                raise StopIteration
            else:
                x = self.sentenceHandler(res)
                x=word_tokenize(x) # tokenize

                # optional stemmer
            
                #wnl = EnglishStemmer()
                #lemmed = []
                #for word in x:
                #    newword=wnl.stem(word)
                #    lemmed.append(newword)
        
                y=doStop(x) # remove stopwords and procedural words
                x=doProcedural(y)
                return (x)
        else: # we will treat one document = one sentence
            if self.paragraphInProgress==False:
                # this is a new paragraph, so fetch it
                res = cursor.fetchone()
                if res == None:
                    raise StopIteration
                else: # new paragraph fetched successfully
                    self.paragraphInProgress = True
                    x = self.sentenceHandler(res)
                    self.workingParagraph = sent_tokenize(x)

                    doc = self.workingParagraph.pop(0)
                    
                    doc=word_tokenize(doc) # tokenize
                    y=doStop(doc) # remove stopwords and procedural words
                    x=doProcedural(y)
                    
                    # before we end, check whether this was a one-sentence paragraph
                    
                    if len(self.workingParagraph)==0:
                        self.paragraphInProgress = False
                        
                    return (x)

            elif self.paragraphInProgress==True:
                # we have already started a paragraph with list of sentences, so pop the first one and yield it as tokens
                # if length becomes 0 at the end, reset the paragraphInProgress flag
                
                    doc = self.workingParagraph.pop(0)
                    
                    doc=word_tokenize(doc) # tokenize
                    y=doStop(doc) # remove stopwords and procedural words
                    x=doProcedural(y)
                    
                    # before we end, check whether this was a one-sentence paragraph
                    
                    if len(self.workingParagraph)==0:
                        self.paragraphInProgress = False
                        
                    return (x)
Example #18
def sum_basic(lines, word_limit, update_non_redundency=True):
    def weight(sents, distribution):
        def _weight_sent(sent):
            tokens = preprocess(word_tokenize(sent))
            return sum(distribution.get(x, 0) for x in tokens) / len(tokens)
            
        return [_weight_sent(sent) for sent in sents]
    
    def probability_distribution(tokens):
        N = len(tokens)
        distinct_words = set(tokens)
        
        probabilities = map(lambda w: tokens.count(w) / N , distinct_words)
        return dict(zip(distinct_words, probabilities))
    
    sents = to_sents(lines)
    tokens = to_tokens(sents)
    tokens = preprocess(tokens)
    
    pd = probability_distribution(tokens)
    
    summary = "" 
    
    while len(word_tokenize(summary)) < word_limit:
        weights = weight(sents, pd)
        highest_weight_sentence = max(zip(sents, weights), key=itemgetter(1))[0]
        summary += " " + highest_weight_sentence
        if update_non_redundency:
            for token in preprocess(word_tokenize(highest_weight_sentence)):
                pd[token] = pd[token] * pd[token]
        else:
            sents.remove(highest_weight_sentence)
            
   
    return summary 
    def sentence_matches(self, sentence_text):
        """Returns true iff the sentence contains this mention's upstream
        and downstream participants, and if one of the stemmed verbs in
        the sentence is the same as the stemmed action type."""
        has_upstream = False
        has_downstream = False
        has_verb = False

        # Get the first word of the action type and assume this is the verb
        # (Ex. get depends for depends on)
        actiontype_words = word_tokenize(self.mention.actiontype)
        actiontype_verb_stemmed = stem(actiontype_words[0])

        words = word_tokenize(sentence_text)

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.upstream.lower()):
            has_upstream = True

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.downstream.lower()):
            has_downstream = True

        for word in words:
            if actiontype_verb_stemmed == stem(word):
                has_verb = True

        return has_upstream and has_downstream and has_verb
    def post(self):
        args = parser.parse_args()
        text = {'text': args['text']}
        print text
        print sent_tokenize(text['text'])
        print word_tokenize(text['text'])
        return text['text']
Example #21
def get_doc_abstract_query_List(norm):
    ranked_top_10_doc_list = map(operator.itemgetter(0), ranked_scores_top_10)
    result_query = ""
    count  = 0 
    synonym_words_list = []
    
    for docID in ranked_top_10_doc_list:
        if dir_of_docs.endswith("/"):
            docID_file_dir = dir_of_docs + docID + ".xml"
        else:
            docID_file_dir = dir_of_docs + "/" + docID + ".xml"
            
        xml_doc = Document(docID, docID_file_dir)
        title = xml_doc.get_title()
        result_query +=  title + " "
        
        """
        if count < 1: # Only get abstract from top document(s)
            result_query += xml_doc.get_abstract() + " "
        """
    
        # Adds synonyms for the top ranked document's title to new query
        if count <= 10:
            title_words = word_tokenize(title)
            for w in title_words:
                synonym_words_list = norm.combine_list(synonym_words_list, norm.get_synonym_list(w))
        count += 1
        
    result_query_list = word_tokenize(result_query)
    result_query_list = norm.combine_list(result_query_list, synonym_words_list)
    normalized = norm.normalize_tokens(result_query_list)
    return normalized
Example #22
    def calculate_pmi_use_case2(self, schema):
        print("Calculating PMI for " + schema)
        corpus_count = 0
        text = []
        for item in self.__mongo_db.get(schema, {}):
            tokens = word_tokenize(item['text'], language='german')
            text += tokens
            corpus_count += len(tokens)
        print(corpus_count)
        counter = Counter(text)
        single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern")
        # counting single pattern occurrences
        for item in single_pattern_table:
            word = item['single_pattern']
            count = counter[word]
            self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word))

        # pmi calculation
        co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern")
        for item in co_occ_table:
            item_id = item['id']
            co_occ_freq = float(item['count'] / corpus_count)
            word1_id = item['pattern_a']
            word2_id = item['pattern_b']
            word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count")
            print(word1_occ)
            word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count")
            print(word2_occ)
            pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
            print(pmi)
            self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))
def clean_raw_txt(body, headline, punct_dct=None, stopwrds_set=None): 
    """Clean the body and headline to remove punctuation, stopwords, etc.

    Args: 
    ----
        body: str
        headline: str
        punct_dct (optional): dict 
            Translation dict resulting from a `str.maketrans()` call             
        stopwrds_set (optional): set  

    Return: 
    ------
        (body_wrds, headline_wrds): tuple
    """

    if punct_dct: 
        body = body.translate(punct_dct)
        headline = headline.translate(punct_dct)

    body_wrds = word_tokenize(body)
    headline_wrds = word_tokenize(headline)

    stopwrds_set = set() if stopwrds_set is None else stopwrds_set

    body_wrds = [wrd.lower() for wrd in body_wrds if wrd.lower() not in stopwrds_set] 
    headline_wrds = [wrd.lower() for wrd in headline_wrds if wrd.lower() not in stopwrds_set]

    return (body_wrds, headline_wrds)
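A hedged usage sketch for clean_raw_txt, with an invented body and headline; it assumes word_tokenize and the NLTK stopwords corpus are available as in the original module.

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

punct_dct = str.maketrans('', '', string.punctuation)
stopwrds_set = set(stopwords.words('english'))

body = "The share price rose sharply, beating all estimates."
headline = "Shares Rise Sharply"
body_wrds, headline_wrds = clean_raw_txt(body, headline, punct_dct, stopwrds_set)
print(body_wrds)      # e.g. ['share', 'price', 'rose', 'sharply', 'beating', 'estimates']
print(headline_wrds)  # e.g. ['shares', 'rise', 'sharply']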
Example #24
def _doc2vec_doc_stream(paths, n, sentences=True):
    """
    Generator to feed sentences to the dov2vec model.
    """
    phrases = Bigram()

    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = word_tokenize(sent)
                        yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
                else:
                    tokens = word_tokenize(line)
                    yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
Example #25
    def __init__(self, txt_type: str, txt: str):
        self.txt_type = txt_type

        if txt_type is "paragraph":
            self.sentences = [word_tokenize(w) for w in sent_tokenize(txt)]
        else:
            self.title = word_tokenize(txt)
Example #26
def load_sick2014(dsfile, mode='relatedness'):
    """ load a dataset in the sick2014 tsv .txt format;

    mode='relatedness': use the sts relatedness score as label
    mode='entailment': use -1 (contr.), 0 (neutral), 1 (ent.) as label """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile) as f:
        first = True
        for line in f:
            if first:
                # skip first line with header
                first = False
                continue
            line = line.rstrip()
            pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgement = line.split('\t')
            if mode == 'relatedness':
                label = float(relatedness_score)
            elif mode == 'entailment':
                if entailment_judgement == 'CONTRADICTION':
                    label = -1
                elif entailment_judgement == 'NEUTRAL':
                    label = 0
                elif entailment_judgement == 'ENTAILMENT':
                    label = +1
                else:
                    raise ValueError('invalid label on line: %s' % (line,))
            else:
                raise ValueError('invalid mode: %s' % (mode,))
            labels.append(label)
            s0.append(word_tokenize(sentence_A))
            s1.append(word_tokenize(sentence_B))
    return (s0, s1, np.array(labels))
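A hedged call sketch; the path below is hypothetical and simply has to point at a local copy of the SICK 2014 tab-separated file, with numpy and word_tokenize imported as in the rest of this module.

# 'data/SICK_train.txt' is a placeholder path, not shipped with this snippet
s0, s1, labels = load_sick2014('data/SICK_train.txt', mode='entailment')
print(len(s0), len(s1), labels[:5])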
Example #27
def load_samples(question, prop_labels):
    samples = []
    q = word_tokenize(question)
    for label in prop_labels:
        text = word_tokenize(label.lower())
        samples.append({'qtext': ' '.join(q), 'label': 0, 'atext': ' '.join(text)})    
    return samples
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example #29
def load_anssel(dsfile, subsample0=3):
    """ load a dataset in the anssel csv format;

    subsample0=N denotes that only every N-th 0-labelled sample
    should be loaded; so e.g. N=3 reduces 80k negatives to 28k
    negatives in the training set (vs. 4k positives); N=10k
    gets you just 8k negatives, etc. """
    s0 = []
    s1 = []
    labels = []
    i = 0
    with open(dsfile) as f:
        c = csv.DictReader(f)
        for l in c:
            label = int(l['label'])
            if label == 0 and (i % subsample0) != 0:
                i += 1
                continue
            labels.append(label)
            try:
                qtext = l['qtext'].decode('utf8')
                atext = l['atext'].decode('utf8')
            except AttributeError:  # python3 has no .decode()
                qtext = l['qtext']
                atext = l['atext']
            s0.append(word_tokenize(qtext))
            s1.append(word_tokenize(atext))
            i += 1
    return (s0, s1, np.array(labels))
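A hedged call sketch; the CSV path is hypothetical, but the file must provide the label, qtext and atext columns the reader expects.

# 'data/anssel-train.csv' is a placeholder path
s0, s1, labels = load_anssel('data/anssel-train.csv', subsample0=3)  # keep every 3rd negative pair
print(len(s0), "pairs loaded,", int(labels.sum()), "positives")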
Example #30
def obtaindata(pos_file,neg_file):
     ##read the input files
    short_pos = open(pos_file, "r").read()
    short_neg = open(neg_file, "r").read()

    documents = []  # documents is gonna be a list of tuples that have a line of review and a class (pos or neg)

    for r in short_pos.split('\n'):
        documents.append((r, "pos"))
    for r in short_neg.split('\n'):
        documents.append((r, "neg"))

    all_words = []  # gonna contain all the words in both corpuses combined (nonunique)

    short_pos_words = word_tokenize(short_pos)
    short_neg_words = word_tokenize(short_neg)

    for w in short_pos_words:
        all_words.append(w.lower())
    for w in short_neg_words:
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:5000]#gets the top 5000 most common words to use as features
    featuresets = [(find_features(rev,word_features), category) for (rev, category) in documents]
    random.shuffle(featuresets)
    return featuresets
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

for w in example_words:
    print(ps.stem(w))

new_text = "John jumped and was jumping to jump but jumps often jumply"

print("un stemmed = ", new_text)

words = word_tokenize(new_text)

stop_words = set(stopwords.words("english"))

words_no_stop_words = []

for w in words:
    if w not in stop_words:
        words_no_stop_words.append(w)

print("New Text with no stop words = ", words_no_stop_words)

for words in words_no_stop_words:
    print(" Stemmed = ", ps.stem(words))
Example #32
 def ret(text):
     text = word_tokenize(text.lower())
     return [w for w in text if w not in self.stopset + self.punct]
Example #33
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
import os
import string
import pickle

from gensim import corpora, models, similarities
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

raw_material = [
    line.strip() for line in open('material.txt', 'r').read().split(
        '-=this is the spread line=-')
]
texts_tokenized = [[
    word.lower() for word in word_tokenize(document.decode('utf-8'))
] for document in raw_material]

english_stopwords = stopwords.words('english')
texts_filtered_stopwords = [[
    word for word in document if not word in english_stopwords
] for document in texts_tokenized]

texts_filtered = [[
    word for word in document if not word in string.punctuation
] for document in texts_filtered_stopwords]

st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document]
                 for document in texts_filtered]
Example #34
for id_summary in range(130, 1615, 1):
    create_users_table = """select description from summary where id = """ + str(
        id_summary) + """;"""
    res = execute_read_query(connect, create_users_table)
    for i in res:
        text1 = i[0]

    corpus = []
    filtered = []
    normal = []
    normal1 = []

    text1 = "".join([ch for ch in text1 if ch not in string.punctuation
                     ])  #удаляем знаки препинания
    corpus = word_tokenize(text1,
                           language="russian")  #делит предложения на слова

    for i in corpus:
        if i in stop_words:
            corpus.remove(
                i
            )  #удаляет стоп слова (предлоги например) надо список слов вручную доработать

    for token in corpus:
        a = corpus.index(token)
        corpus[a] = (morph.parse(token)[0].normal_form
                     )  #приводит слова к их нормальной форме

    experience = 0
    op = corpus.count("опыт")
    if (op > 0):
Example #35
        return True
 
    def on_error(self, status):
        print(status)
        return True
 
twitter_stream = Stream(auth, MyListener())
twitter_stream.filter(track=['#dengue', 'dengue'], languages =['pt'])
                             
import pandas as pd
from nltk.tokenize import word_tokenize
 

df = pd.read_json("dados/spotify.json", orient = 'records', lines = True)  # a convenient way to read the file

word_tokenize(" ".join(df['text']), language = 'portuguese')



import json
 
with open('python.json', 'r') as f:
    line = f.readline() # read only the first tweet/line
    tweet = json.loads(line) # load it as Python dict
    print(json.dumps(tweet, indent=4)) # pretty-print



tweet['text']

from textblob import TextBlob as tb
Example #36
def normalize_document(pathname, filename):
    document_words = dict()
    path = os.path.join(pathname, filename)
    with open(path, 'r') as document:
        for line in document:
            sentence_to_normalize = line.strip()
            if len(sentence_to_normalize) == 0:
                continue
            print_coloured_bold(
                '\nSentence to stem: ' + sentence_to_normalize + '\n', "red")

            #removing m-dash
            sentence_to_normalize = sentence_to_normalize.replace("–",
                                                                  " ").lower()
            sentence_to_normalize = re.sub("-{2,}", "", sentence_to_normalize)

            #removing contract forms
            if ("'t" in sentence_to_normalize):
                sentence_to_normalize = sentence_to_normalize.replace("'t", "")

            #tokenization
            word_tokens = word_tokenize(sentence_to_normalize)

            #punctuation removal
            word_tokens_filtered = [
                w for w in word_tokens
                if not w in punctuation and not w == "'s"
            ]

            #skip if punctuation within words (except -./) or split if / within word
            word_tokens_noslash = list()
            for w in word_tokens_filtered:
                if not any(char in punctuation.replace("-", "").replace(
                        ".", "").replace("/", "") for char in w):
                    if "/" in w:
                        words = w.split("/")
                        for split in words:
                            if not split == "":
                                word_tokens_noslash.append(split)
                    else:
                        word_tokens_noslash.append(w)

            #leave acronyms and split others in case of .
            word_tokens_dot = list()
            regex = re.compile('(?:[a-z]\.){2,}')
            for w in word_tokens_noslash:
                if (w + "." in sentence_to_normalize and regex.match(w + ".")):
                    word_tokens_dot.append(w)
                elif ("." in w):
                    words = w.split(".")
                    for split in words:
                        if not split == "":
                            word_tokens_dot.append(split)
                else:
                    word_tokens_dot.append(w)

            #stopwords removal (done before stemming, less words to stem)
            stop_words = set(stopwords.words('english'))
            no_stopwords_sentence = [
                w for w in word_tokens_dot if not w in stop_words
            ]

            #digits removal
            sentence_words_nodigits = [
                w for w in no_stopwords_sentence if not w.isdigit()
            ]

            #roman numerals removal
            regex = re.compile('^(?=[MDCLXVI])M*D?C{0,4}L?X{0,4}V?I{0,4}$')
            no_roman_numerals_sentence = [
                w for w in sentence_words_nodigits if not regex.match(w)
            ]

            #one letter words removal
            sentence_words_nosingleletters = [
                w for w in no_roman_numerals_sentence if not len(w) < 2
            ]
            print_coloured_bold("Stop words result", "cyan")
            print(sentence_words_nosingleletters)
            print('\n')

            #stemming
            stemmer = TreeTagger(
                path_to_treetagger='/home/biar/Desktop/ProgettoWIR/treetagger')
            for word in sentence_words_nosingleletters:
                stem = stemmer.tag(word)
                if not (stem[0][1] == "CRD"):
                    if not stem[0][2] == '<unknown>':
                        if '|' in stem[0][2]:
                            first_word = ((stem[0][2]).split('|'))[0]
                            stem[0][2] = first_word
                            if (len(first_word) > 1):
                                w = correct_stemming(stem).lower()
                                if not w in document_words:
                                    document_words[w] = 1
                                else:
                                    document_words[w] += 1
                        else:
                            if (len((stem[0][2]).lower()) > 1):
                                w = correct_stemming(stem).lower()
                                if not w in document_words:
                                    document_words[w] = 1
                                else:
                                    document_words[w] += 1
                    else:
                        w = (stem[0][0]).lower()
                        if not w in document_words:
                            document_words[w] = 1
                        else:
                            document_words[w] += 1
    return document_words
Example #37
    if token in stopwords.words('english'): 
        clean_tokens.remove(token) 
freq = nltk.FreqDist(clean_tokens) 
for key, val in freq.items(): 
    print(str(key) + ':' + str(val))

freq.plot(20, cumulative=False)

from nltk.tokenize import sent_tokenize 
mytext = "Hello Adam, how are you? I hope everything is going well. Today is a good day, see you dude." 
print(sent_tokenize(mytext))
from nltk.tokenize import sent_tokenize 
mytext = "Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude." 
print(sent_tokenize(mytext))
from nltk.tokenize import word_tokenize
mytext = "Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude."
print(word_tokenize(mytext))

from nltk.tokenize import sent_tokenize
mytext = "Bonjour M. Adam, comment allez-vous? J'espère que tout va bien. Aujourd'hui est un bon jour."
print(sent_tokenize(mytext,"french"))

from nltk.corpus import wordnet
syn = wordnet.synsets("pain")
print(syn[0].definition())
print(syn[0].examples())

from nltk.corpus import wordnet
syn = wordnet.synsets("NLP")
print(syn[0].definition())
syn = wordnet.synsets("Python")
print(syn[0].definition())
Example #38
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
uniquewords = {}
labels = [0] * len(titles)
for j in range(len(titles)):
    labels[j] = hash_labels[labelsTrain[j]]
    temp = titles[j].lower()
    temp = re.sub(r'\d+', '', temp)
    tempstr = ""
    for char in temp:
        if char not in string.punctuation:
            tempstr += char
    temp = tempstr
    temp = temp.strip()
    temp = temp.replace('\n', ' ')
    t = word_tokenize(temp)
    temp = [k for k in t if not k in stop_words]
    temp2 = [stemmer.stem(word=word) for word in temp]
    temp3 = [lemmatizer.lemmatize(word=word) for word in temp2]
    titles[j] = ' '.join(temp3)
    for word in temp3:
        if word in uniquewords:
            uniquewords[word] += 1
        else:
            uniquewords[word] = 1
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
unique_word_count_vectorizer = tfidf_vectorizer.fit_transform(titles)
X_train, X_test, Y_train, Y_test = train_test_split(
    unique_word_count_vectorizer, labels, test_size=0.2, random_state=109)
gnb = MultinomialNB()
gnb.fit(X_train.toarray(), Y_train)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string
import pickle

filename = 'TFIDF.pickle'
pickle.dump(TFIDF, open(filename, 'wb'))

kalimat = "PERANCANGAN SISTEM PAKAR UNTUK DIAGNOSA PENYAKIT ANAK" 
lower_case = kalimat.lower()
print (lower_case)


punctuation = lower_case.translate(str.maketrans('','',string.punctuation)).strip()
print(punctuation)

tokenize = word_tokenize(punctuation)
print(tokenize)


factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
hasil_stopword = []
for i in tokenize:
    word = stopword.remove(i)
    if word !='':
        hasil_stopword.append(word)
print(hasil_stopword)


factory = StemmerFactory()
stemmer = factory.create_stemmer()  
Example #40
def clean_sentence_stopwords(sentence):
    tokens = word_tokenize(sentence)
    sentence_clean = [w for w in tokens if not w in stopwords]
    sentence_clean = ' '.join(sentence_clean)
    return sentence_clean
Example #41
print('Document file: ' + in_file + ' Saved in ' +
      out_path)  # displays the location of the saved file to the user

#Text Preprocessing
print('File :' + in_file + ' submitted for preprocessing')
#change end of sentence character to #
#Enables reconstruction of sentences after stop words removal
text_file = text_file.replace('.\n\n', '#')
text_file = re.sub('[^a-zA-Z0-9\n#]', ' ',
                   text_file)  #remove special and unwanted characters
text_file = text_file.lower(
)  #convert all text to lower case; facilitates stop word removal

#remove stop words and converts words to their stem/root
stop_words = set(stopwords.words('english'))  # define stopwords object
word_tokens = word_tokenize(
    text_file)  #split text to words, word_tokens is a list
#removes all stopwords from word tokens, filtered_sentence is a list

#Alternative code for removal of stopwords using "a list comprehension"
#filtered_sentence = [w for w in word_tokens if not w in stop_words]

#define a list to hold test whose stop words have been removed
filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        #stemword is user defined function
        #Stemword takes words back to their stem/root
        filtered_sentence.append(stemword(w))

text_file = ' '.join(filtered_sentence)  # convert from list to string
def autocorrect(sent):
    t_text = word_tokenize(sent)
    t_text = [spell(word) for word in t_text]
    input_text = ' '.join(t_text)
    #print(input_text)
    return input_text
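A hedged usage sketch; it assumes spell comes from the older autocorrect package API (recent releases expose a Speller class instead) and that NLTK's punkt data is installed.

from autocorrect import spell          # older autocorrect API; newer versions use Speller()
from nltk.tokenize import word_tokenize

print(autocorrect("Ths sentense has sme typos"))  # each token is passed through spell()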
def show_entry_fields():
    #------------------------------------------------------------
  #add to dictionary
  def addtodict(list,k):
      for i in list:
       if i in dict:
           if k in dict[i][-1]:
               #print(k)
               dict[i][-1][1]=dict[i][-1][1]+1
           else:
              dict[i] = dict[i]+[[k,1]]
       else:
          dict[i]=[[k,1]]
          

  # ------------------------------------------------
  #tokenize query
  def tokenize_query(s):
      global extras_list
      query1 = word_tokenize(s)
      query = [lemmatizer.lemmatize(w.lower()) for w in query1 if not w in extras_list]
      #print(query)
      return query

  #---------------------------------------------------------

  def find_intersection(lower_list,query):
        intersection=[]
        for i in lower_list:
            #print(i)
            if i in query:
                if i in intersection:
                    continue
                else:
                    intersection.append(i)          
            else:
                continue
        #print(intersection)      
        return intersection 

   #-------------------------------------------------------------
  def document_tfidf(document,intersection,k):
          weight=[]
          for i in intersection:
              x=len(dict[i])
              for j in range(0,x):
                  if(dict[i][j][0]==k):
                      weight.append(1+math.log10(dict[i][j][1]))
          
          for i in document:
              if i in intersection:
                  continue
              else:
                  x=len(dict[i])
                  for j in range(0,x):
                      if(dict[i][j][0]==k):
                          weight.append(1+math.log10(dict[i][j][1]))
          #weighted values before normalization
          #print(weight)
          
          n_value = normalise(weight)
          n=0
          for k in weight:
              weight[n]=k/n_value
              n=n+1                      
          return weight

   #--------------------------------------------------------------
  def minimize_doc(document):
      listed = []
      for i in document:
          if i in listed:
              continue
          else:
              listed.append(i)
      return listed    

   #------------------------------------------------------------

  def normalise(normal):
      sum = 0
      for k in normal:
          sum = sum + (k*k)
      value = math.sqrt(sum)
      return float(value)


  #--------------------------------------------------
  # for a query


  def add_query(query):
      for i in query:
          if i in query_dict:
              query_dict[i]=query_dict[i]+1
          else:
              query_dict[i]=1

  #---------------------------------------------------------------                                     
  #---------------------------------------------------------------


  def query_tf(query_listed):
      for i in query_listed:
          q_tf[i] = 1+math.log10(query_dict[i])
          

  #---------------------------------------------------------------

  def query_idf(query_listed):
      for i in query_listed:
          if i not in dict:
              q_idf[i] = 0
          else:
              q_idf[i] = math.log10((doc_count/len(dict[i])))

  #---------------------------------------------------------------


  #heap sort
  def heapify(arr, n, i):
      largest = i 
      l = 2 * i + 1    
      r = 2 * i + 2     

      if l < n and arr[i] < arr[l]:
          largest = l
          
      if r < n and arr[largest] < arr[r]:
          largest = r
   
      if largest != i:
          arr[i],arr[largest] = arr[largest],arr[i]
          heapify(arr, n, largest)

  def heapSort(arr):
      n = len(arr)
      for i in range(n, -1, -1):
          heapify(arr, n, i)
   
      for i in range(n-1,-1, -1):
          arr[i], arr[0] = arr[0], arr[i]  
          heapify(arr, i, 0)
    

  #---------------------------------------------------------------

  f=[]
  #print(query_listed)
  #print(query_dict)
  #print("tf of query") 

  #-----------------------------------------------------

  for (dirpath,dirnames,filenames) in walk('G:/Users/avina/Desktop/lol'):
      f.extend(filenames)
      
      doc_count = len(f)
  for k in f:
         print(k) 
         fo = open(k,"r+",encoding="utf8")
          
         data = fo.read()

         list = word_tokenize(data)
         lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list]  
         
         addtodict(lower_list,k)
         fo.close()

  #-------------------------------------------------------------------------------
  s = e1.get() # INPUT HERE<-------------------------------S-------------------------------------->     
  query = tokenize_query(s)   
  add_query(query)
  query_listed = minimize_doc(query)
  query_tf(query_listed)

  #-------------------------------------------------------------------------------
  query_idf(query_listed)
  #idf of query
  #print(q_idf)
 

  tf_idf = {}

  for i in q_tf:
      
      tf_idf[i] = (q_tf[i]*q_idf[i])

  #tf idf value before normalization    
  #print(tf_idf)
  

  sum = 0
  for i in tf_idf:
      sum = sum+tf_idf[i]*tf_idf[i]

  #normalized value   
  value = math.sqrt(sum)
  #print(value)
  

  for i in tf_idf:
        if value==0:
          print("No documents found")
        else:  
           tf_idf[i]=tf_idf[i]/value
        

  #----------------------------------------------
  #normalized tf idf value
  #print(tf_idf)
  
  # tf_idf dictionary contains the tf_idf values of query
  #------------------------------------------------------------
  #-------------------------------------------------------------

  for i in query_listed:   
      if i in dict:
          local_len = len(dict[i])
          for j in range(0,local_len):
              file_name = dict[i][j][0]
              if file_name in document:
                  continue
              else:
                  document.append(dict[i][j][0])
      else:
          continue

  #print(document)    
  #-------------------------------------------------------------

  for k in document:
         fo = open(k,"r+",encoding="utf8")
        # print(k)
         data = fo.read()

         list = word_tokenize(data)
         lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list]  
      
         fo.close()
    
         listed = minimize_doc(lower_list)
         intersection = find_intersection(lower_list,query)
  #  print("intersection values")
  # print(intersection)
 

         
         weight = document_tfidf(listed,intersection,k)
  #       print("weighted values of document after normalization")
  #       print(weight)
 

         total = [] 
         ins_len = len(intersection)
         l=0
         cosine = 0
         for i in intersection:
             total.append(weight[l]*tf_idf[i])
             cosine = cosine+total[l]
             l = l+1
   #      tf idf of common values between document and query    
   #      print(total)
   
   #      total cosine value
   #      print(cosine)
         cosine_list.append(cosine)
         cosine_dict[cosine] = k

  #print(cosine_dict)       

  n = len(cosine_list)
  heapSort(cosine_list)
  #print(cosine_list)
  count=0
  # print("First Name: %s\nLast Name: %s" % (e1.get(), e2.get()))
  ll = []
  for i in range(n-1,-1,-1):
     if count>10:
         break
     else:  
      f1=cosine_list[i]
      ll.append(cosine_dict[f1])
      count = count+1
  # show the best-matching documents (up to ten) in the UI, one label per row
  for offset, doc_name in enumerate(ll[:10]):
      Label(root, text=doc_name, bg='lightblue', font='10', height=2, width=15).grid(row=5+offset)
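For reference, the ranking logic above (query tf-idf, per-document weights, cosine scores, top-ten list) can be sketched much more compactly with scikit-learn; the corpus, query, and rank_documents names below are illustrative and not part of the original program.

# Minimal sketch (not the original program): rank documents against a query by cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_documents(corpus, query, top_k=10):
    vectorizer = TfidfVectorizer()
    doc_matrix = vectorizer.fit_transform(corpus)      # one tf-idf row per document
    query_vector = vectorizer.transform([query])       # tf-idf row for the query
    scores = cosine_similarity(query_vector, doc_matrix)[0]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]                              # [(document index, score), ...]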
Example #44
0
def change_token(texts):
    tokens = word_tokenize(texts)
    return tokens
Example #45
0
import nltk
import io
from nltk.tokenize import sent_tokenize, word_tokenize

with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
    data=myfile.read().replace('\n', '')

text = word_tokenize(data)
finished = nltk.pos_tag(text)

print(finished)
def my_tokenizer(doc):
    text = word_tokenize(doc)
    tokens_without_sw = [word for word in text if word not in all_stopwords]
    return tokens_without_sw
Example #47
0
    def on_data(self, data):
        try:
          
           tweet1=data.split(',"text":"')[1]
           tweet2=tweet1.split(',"source":"')[0]
           tweet3=tweet2.split('https:')[0]
           tweet4=tweet3.split(':')[1]
           tweet5=tweet4.replace('RT','')
           tweet6=tweet5.replace(':','')
           tweett=tweet6.replace('@','')
           xx=detect(tweett)
           
           if(xx=='en'):
               saveFile = open('CristianoGame.txt','a')   
               saveFile.write(tweett)
               saveFile.write('\n')
               saveFile.close()
               words=word_tokenize(tweett)
               #print(words)
               #print('\n')
               fili=[]
              
               for w in words:
                   if w not in stop_words:
                       fili.append(w)

               fili2=[]

               #print(fili)

              # print("TABDIL")

               for w in fili:
                   fili2.append(lemmatizer.lemmatize(w))

               #print(fili2)
                       
                #further checking
               end=time.clock()   # note: time.clock() was removed in Python 3.8; time.perf_counter() is the modern replacement
               zaman=end-start    # elapsed seconds since the stream started ("zaman" = time)
               str1=" ".join(str(e) for e in fili2)
               #print(str1)
               analysis=TextBlob(str1)
               analysis2=s.sentiment(str1)
               
               adad=analysis.sentiment.polarity
               print(adad)
               #print(adad)
               #xar[0]=x
               #print(zaman)
               x=adad
               y=zaman
               liste.append(adad)
               niste.append(zaman)
               print(len(liste))
               #print("HI")
               #print(ii)
               #print("HI2")
               #alpha=open('file2.txt','a')
               #print >>alpha, adad
               #print("HI")
               #saveFile.write(tweett)
               #xar.append(x)
               #print(zaman)
               #yar.append(y)
               #ax1.clear()
               #ax1.plot(xar,yar)
               #print("HI")


           if(len(liste)==5000):
               print(liste[0])
               print(" 150 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Cristiano-1500')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               
               plt.show()

           if(len(liste)==30000):
               print(liste[0])
               print(" 300 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Messi-300')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               
               plt.show()
               
           if(len(liste)==3000):
               print(liste[0])
               print(" 500 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Cristiano-500')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               
               plt.show()
           
           if(len(liste)==11000):
               print(liste[0])
               print(" 1000 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Cristiano-1000')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               
               plt.show()

           if(len(liste)==20000):
               print(liste[0])
               print(" 2000 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Messi-2000')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               plt.show()

           if(len(liste)==30000):
               print(liste[0])
               print(" 3000 ta !!! " )
               print(np.mean(liste))
               print("AVERAGE")
               print(sum(liste))
               print("SUM")
               print(zaman)
               plt.plot(niste, liste,'-o')
               plt.title('Messi-3000')
               plt.xlabel('Time(sec)')
               plt.ylabel('SentimentValue')
               plt.show()
               #alaf = open('twik.txt','a')
               '''
               for p in liste:
                   alaf.write("%f\n" % p)
                   #alaf.write('\n')
               '''
               #alaf.close()
               #for itm in liste:
                #   print>>
           

           #print(len(liste))
           return True
   
        except:
            return True
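The checkpoint blocks above repeat the same statistics-and-plot routine with different thresholds and titles; a helper along the following lines (a sketch with illustrative names, not the original code) would remove most of that duplication.

# Sketch only: print running statistics and plot the sentiment curve at chosen checkpoints.
# The checkpoint-to-title mapping shown in the usage comment is illustrative.
import numpy as np
import matplotlib.pyplot as plt

def maybe_plot_checkpoint(liste, niste, checkpoints):
    count = len(liste)
    if count in checkpoints:
        print(count, "tweets collected; mean sentiment:", np.mean(liste), "sum:", sum(liste))
        plt.plot(niste, liste, '-o')
        plt.title(checkpoints[count])
        plt.xlabel('Time(sec)')
        plt.ylabel('SentimentValue')
        plt.show()

# Possible call site, right after liste/niste are appended to in on_data:
#   maybe_plot_checkpoint(liste, niste, {5000: 'Cristiano-1500', 30000: 'Messi-3000'})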
file=open("/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_train","r")
lines = file.readlines()
file.close()


train_data = []
train_labels = []
train_trueLabels = []

train_textdata = []
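# input format (inferred from how arr[0..2] are used below): "<label>\t<true label>\t<raw text>"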

for line in lines:
    line = line.lower().strip()
    arr = re.split("\t", line)
    train_data.append(arr[2])
    word_tokens = word_tokenize(arr[2])
    train_textdata.append(word_tokens)
    train_labels.append(arr[0])
    train_trueLabels.append(arr[1])

list_toktextdatas.append(train_textdata)

 
#file=open("D:/PhD/dr.norbert/dataset/shorttext/stackoverflow/semisupervised/stackoverflowraw_ensembele_test","r")  
file=open("/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_test","r") 
#file=open("D:/PhD/dr.norbert/dataset/shorttext/data-web-snippets/semisupervised/data-web-snippetsraw_ensembele_test","r")
#file=open("D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_test","r")  

lines = file.readlines()
file.close()
X_train = []
Y_train = []
count = 0
for l in lines:
    # if count > 100:
    # 	break
    # count += 1
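    # each line is expected to hold "<document filename> <label>" (inferred from the split and open() below)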
    x, y = l.split(' ')
    Y_train.append(y)
    with open(path2 + x, 'r') as doc_file:
        temp = doc_file.read()
    # tokens=wordpunct_tokenize(str(temp))
    # tokens = [w for w in tokens if not w in stop_words]
    # doc = [word for word in tokens if word in model.wv.vocab]
    # doc_mean = np.mean(model.wv[doc], axis=0)
    temp = word_tokenize(temp.lower())
    v = model.infer_vector(temp)
    X_train.append(v)

# In[5]:

print("train file input and preprocess")

with open(path3 + 'temp.txt') as f:
    lines = f.readlines()

X_test = []
Y_test = []
count = 0
for l in lines:
    # if count > 100:
def main():
    parser = argparse.ArgumentParser(
        description='Synthetic divergent data creation')
    parser.add_argument('--debug', help='debug mode', action='store_true')
    parser.add_argument('--data', help='input positive examples')
    parser.add_argument('--output',
                        help='output directory of synthetic training data',
                        default='synthetic')
    parser.add_argument(
        '--mode',
        help='how data examples are generated (g: generalize, p: particularize, u: uneven, i: insert, r: replace, d: delete)',
        default='i')
    parser.add_argument('--pretrained_bert',
                        help='pretrained bert',
                        default='bert-base-cased')
    parser.add_argument(
        '--bert_local_cache',
        help='path to local directory where pretrained bert is saved')

    o = parser.parse_args()
    d = synthetic_divergences()

    # Create directory for bert local cache
    if not os.path.exists(o.bert_local_cache):
        os.makedirs(o.bert_local_cache)

    pos_to_wrd = defaultdict(list)
    indices = []
    with io.open(o.data, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as f:
        i = 0
        n_total = 0
        for line in f:
            n_total += 1
            if n_total % 100000 == 0:
                if n_total % 1000000 == 0:
                    sys.stderr.write(str(n_total))
                else:
                    sys.stderr.write(".")

            indices.append(i)
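            # each input line: source sentence \t target sentence \t word alignment (tab-separated)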
            tok = line.strip('\n').split("\t")
            src = tok.pop(0).strip().split(' ')
            tgt = tok.pop(0).strip().split(' ')
            ali = tok.pop(0).strip().split(' ')
            src = word_tokenize(' '.join(src))
            tagged_sent = nltk.pos_tag(src)
            words, tags = zip(*tagged_sent)
            pos = list(tags)
            d.add(src, tgt, pos, ali)
            pos_phrases_ngrams(src, pos, pos_to_wrd)
            i += 1

    # Configure write mode and output files
    write_mode = 'w'

    output_path = os.path.join(o.output,
                               'from_{0}'.format(str(o.data.split('/')[-1])))

    # Create output directories
    try:
        os.makedirs(output_path)
    except FileExistsError:
        sys.stderr.write('Warning: Output directory already exists\n')

    if 'g' in o.mode:
        lm_model = BertForMaskedLM.from_pretrained(
            o.pretrained_bert, cache_dir=o.bert_local_cache)
        lm_tokenizer = BertTokenizer.from_pretrained(
            o.pretrained_bert, cache_dir=o.bert_local_cache)
        output_g = open(os.path.join(output_path, 'generalization'),
                        write_mode)
        output_g_span = open(os.path.join(output_path, 'generalization.span'),
                             write_mode)
    if 'p' in o.mode:
        lm_model = BertForMaskedLM.from_pretrained(
            o.pretrained_bert, cache_dir=o.bert_local_cache)
        lm_tokenizer = BertTokenizer.from_pretrained(
            o.pretrained_bert, cache_dir=o.bert_local_cache)
        output_p = open(os.path.join(output_path, 'particularization'),
                        write_mode)
        output_p_span = open(
            os.path.join(output_path, 'particularization.span'), write_mode)
    if 'i' in o.mode:
        output_i = open(os.path.join(output_path, 'insert'), write_mode)
        output_i_span = open(os.path.join(output_path, 'insert.span'),
                             write_mode)
    if 'u' in o.mode:
        output_u = open(os.path.join(output_path, 'uneven'), write_mode)
        output_u_span = open(os.path.join(output_path, 'uneven.span'),
                             write_mode)
    if 'd' in o.mode:
        output_d = open(os.path.join(output_path, 'delete'), write_mode)
        output_d_span = open(os.path.join(output_path, 'delete.span'),
                             write_mode)
    if 'r' in o.mode:
        output_r = open(os.path.join(output_path, 'replace'), write_mode)
        output_r_span = open(os.path.join(output_path, 'replace.span'),
                             write_mode)

    for i in indices:

        # Insert sentence
        if 'i' in o.mode:
            synthetic_pair = d.insert_pair(i, o)
            if synthetic_pair:
                output_i.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_i_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))

            else:
                output_i.write(none_)
                output_i_span.write(none_)

        # Random pairing of sentences
        if 'u' in o.mode:
            synthetic_pair = d.uneven_pair(i, o)
            if synthetic_pair:
                output_u.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_u_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))
            else:
                output_u.write(none_)
                output_u_span.write(none_)

        # Create lexical substitution (generalization) instance
        if 'g' in o.mode:
            synthetic_pair = d.generalization_pair(i, o, lm_model,
                                                   lm_tokenizer)
            if synthetic_pair:
                output_g.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_g_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))
            else:
                output_g.write(none_)
                output_g_span.write(none_)

        # Create lexical substitution (particularization) instance
        if 'p' in o.mode:
            synthetic_pair = d.particularization_pair(i, o, lm_model,
                                                      lm_tokenizer)
            if synthetic_pair:
                output_p.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_p_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))
            else:
                output_p.write(none_)
                output_p_span.write(none_)

        # Create subtree deletion instance
        if 'd' in o.mode:
            synthetic_pair = d.delete_pair(i, o)
            if synthetic_pair:
                output_d.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_d_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))
            else:
                output_d.write(none_)
                output_d_span.write(none_)

        # Create phrase replacement instance
        if 'r' in o.mode:
            synthetic_pair = d.replace_pair(i, o, pos_to_wrd)
            if synthetic_pair:
                output_r.write('{0}\t{1}\n'.format(' '.join(
                    synthetic_pair[0]), ' '.join(synthetic_pair[1])))
                output_r_span.write('{0}\t{1}\n'.format(
                    ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3])))
            else:
                output_r.write(none_)
                output_r_span.write(none_)
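Each mode branch above repeats the same pair/span write; a small helper such as the following sketch (illustrative name, reusing the same none_ placeholder the original writes) would collapse that duplication.

# Sketch only: write one synthetic pair and its span annotation, or the placeholder line.
def write_pair(synthetic_pair, out_file, span_file, none_):
    if synthetic_pair:
        out_file.write('{0}\t{1}\n'.format(' '.join(synthetic_pair[0]),
                                           ' '.join(synthetic_pair[1])))
        span_file.write('{0}\t{1}\n'.format(' '.join(synthetic_pair[2]),
                                            ' '.join(synthetic_pair[3])))
    else:
        out_file.write(none_)
        span_file.write(none_)

# e.g. inside the loop:  write_pair(d.insert_pair(i, o), output_i, output_i_span, none_)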
Example #51
0
    sumat_v1 = sumat_v1**0.5
    if sumat_v1 == 0:
        sumat_v1 = 1
    sumat_v2 = sumat_v2**0.5
    if sumat_v2 == 0:
        sumat_v2 = 1
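    # cosine similarity: dot product divided by the product of the two vector norms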

    measure = sumat_pto / (sumat_v1 * sumat_v2)
    return measure


directorio = os.listdir('Datosrank/')

query = "31483txt start project gutenberg ebook gaslight sonata produced suzanne shell"
stop_words = set(stopwords.words('english'))
tok_query = word_tokenize(query)
final_query = [word for word in tok_query if not word in stop_words]

with open('pal.json') as json_file:
    pal = json.load(json_file)

print("Acabe la larga espera pal")

with open('queryw.json') as json_file:
    queryw = json.load(json_file)

print("Acabe la larga espera query")
listidx = []
vectorw_query = []
for word in final_query:
    idx = pal.index(word)
                     )  # drop words that are only one character long
        tmp = re.sub(r'\s+', ' ', str(fitur_ekstraksi2[cuitan])
                     )  # collapse multiple spaces into a single space
        fitur_ekstraksi3.append(tmp)

    fitur_ekstraksi5 = []
    for cuitan in range(0, len(fitur_ekstraksi3)):
        tmp = word_tokenize(str(fitur_ekstraksi3[cuitan]))
        fitur_ekstraksi5.append(tmp)

    return fitur_ekstraksi5


with open('stopwordv1.txt', 'r') as stopsunda1:
    stopsunda2 = stopsunda1.read()
stopsunda = word_tokenize(stopsunda2)


def swr(a, b):
    filtered_sentence = []
    for w in a:
        if w not in b:
            filtered_sentence.append(w)
    return filtered_sentence


callbackvalue = preprocessing(fitur)


def stopw(datanext):
    fitur_ekstraksistop = []
def get_syllable_count(text):
	return (sum(map(lambda w: allnumsyllables(w), word_tokenize(text))))
Example #54
0
    # create the output file, or empty it if it already exists
    open('featureVectorForSentence.csv', 'w').close()

    s = LancasterStemmer()
    unwantedWordes = [
        'the', 'a', 'is', 'was', 'are', 'were', 'to', 'at', 'i', 'my', 'on',
        'me', 'of', '.', 'in', 'that', 'he', 'she', 'it', 'by'
    ]
    for i in range(0, a - 1):
        lexicon_dictionary[i][0] = s.stem(lexicon_dictionary[i][0])

    for x in sentences:
        featureVector = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        words = word_tokenize(x)
        for y in words:
            y = s.stem(y)
            y = y.lower()
            if y in unwantedWordes:
                continue
            for i in range(0, a - 1):
                if y == lexicon_dictionary[i][0]:
                    for j in range(0, 10):
                        featureVector[j] = featureVector[j] + int(
                            lexicon_dictionary[i][j + 1])
                    break
        # append this sentence's feature vector as one CSV row
        with open('featureVectorForSentence.csv', 'a') as featuresFile:
            for k in range(0, 10):
                featuresFile.write(str(featureVector[k]) + ',')
            featuresFile.write('\n')
Example #55
0
def find_features(document, word_features):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
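# Illustrative usage (hypothetical inputs):
#   find_features("The movie was great", ["great", "terrible"]) -> {"great": True, "terrible": False}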
def get_word_count(text):
    word_count=0
    filterwords=filter(not_punctuation, word_tokenize(text))
    for word in filterwords:
        word_count=word_count+1
    return word_count
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
from multiprocessing import Array



fi = 'test_data.txt'

printable = set(string.printable)
# stop words from nltk
stop_words = set(stopwords.words("english"))

file_con = open(fi).read().lower()
file_content = "".join(ch for ch in file_con if ch in printable)

example_words = word_tokenize(file_content)
# removing punctuation tokens
example_words = [w for w in example_words if w not in string.punctuation]
# removing stop words
cleaned_text = [w for w in example_words if w not in stop_words]
print(cleaned_text)
cleaned_t = " ".join(cleaned_text)
f = open('cleaned_test_data.txt', 'w')
f.write(cleaned_t)
f.close()
def text_statistics(text):
    word_count = get_word_count(text)
    sent_count = get_sent_count(text)
    syllable_count = sum(map(lambda w: allnumsyllables(w), word_tokenize(text)))
    return word_count, sent_count, syllable_count
def s2s_preprocess(train_file_name, test_file_name):
    raw_sentences = list()
    ontology_results = list()
    max_length = 0
    for one_line in open(train_file_name):
        one_line = one_line.strip()
        print(one_line)
        if len(one_line.split("\t")) != 2:
            continue
        raw_sentence = one_line.split("\t")[0]
        ontology_string = one_line.split("\t")[1]
        tokenized_list = word_tokenize(raw_sentence)
        if len(tokenized_list) > max_length:
            max_length = len(tokenized_list)
        ontology_tuple = ontology_string.split()
        if len(ontology_tuple) != 3:
            continue
        raw_sentences.append(tokenized_list)
        ontology_results.append(ontology_tuple)

    token_voc_list = list()
    ontology_voc_list = list()

    for one_raw_sentence in raw_sentences:
        for one_token in one_raw_sentence:
            token_voc_list.append(one_token)
    token_voc_list = list(set(token_voc_list))

    for one_ontology_result in ontology_results:
        for one_ontology in one_ontology_result:
            ontology_voc_list.append(one_ontology)
    ontology_voc_list = list(set(ontology_voc_list))

    token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list, eos_flag=False)
    ontology_idx_dict, idx_ontology_dict = dictionary_generator(ontology_voc_list, oov_flag=False)

    token_store_data = list()
    for one_raw_sentence in raw_sentences:
        token_store_data.append(data_indexer(one_raw_sentence, token_idx_dict))

    ontology_store_data = list()
    for one_ontology_result in ontology_results:
        ontology_store_data.append(data_indexer(one_ontology_result, ontology_idx_dict))

    pretrained_dict = dict()

    print("Loading pretrained Word2Vec model ...")
    w2v_embedding_path = "data/w2v/wiki20170101"
    w2v_model = Word2Vec.load(w2v_embedding_path)

    for one_line in open(train_file_name):
        one_line = one_line.strip()
        if len(one_line.split("\t")) != 2:
            continue
        raw_sentence = one_line.split("\t")[0]
        tokenized_list = word_tokenize(raw_sentence)
        for one_token in tokenized_list:
            if one_token not in w2v_model.wv.vocab:
                continue
            pretrained_dict[one_token] = w2v_model[one_token]

    for one_line in open(test_file_name):
        one_line = one_line.strip()
        if len(one_line.split("\t")) != 2:
            continue
        raw_sentence = one_line.split("\t")[0]
        tokenized_list = word_tokenize(raw_sentence)
        for one_token in tokenized_list:
            if one_token not in w2v_model.wv.vocab:
                continue
            pretrained_dict[one_token] = w2v_model[one_token]


    processed_data = (token_idx_dict,
                      idx_token_dict,
                      ontology_idx_dict,
                      idx_ontology_dict,
                      pretrained_dict,
                      token_store_data,
                      ontology_store_data,
                      raw_sentences,
                      ontology_results)
    pickle.dump(processed_data, open("data/preprocessed/20180405.pkl", "wb"))
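dictionary_generator and data_indexer are defined elsewhere in the original project; the sketch below shows one plausible shape for them, inferred only from the call sites above, so the special tokens and flag behaviour are assumptions.

# Hypothetical sketch of the helpers used above, inferred from their call sites.
def dictionary_generator(voc_list, eos_flag=True, oov_flag=True):
    specials = []
    if eos_flag:
        specials.append("<eos>")   # assumed end-of-sequence token
    if oov_flag:
        specials.append("<oov>")   # assumed out-of-vocabulary token
    idx_dict = {tok: i for i, tok in enumerate(specials + voc_list)}
    rev_dict = {i: tok for tok, i in idx_dict.items()}
    return idx_dict, rev_dict

def data_indexer(tokens, idx_dict):
    # map tokens to indices, falling back to the OOV index when available
    oov = idx_dict.get("<oov>")
    return [idx_dict.get(tok, oov) for tok in tokens]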
    def __tokenize(self, a_txt: str = ''):
        self.Tokens = word_tokenize(a_txt)
        print(self.Tokens)