Example #1
def getData(Name, Type, Level):
    # type
    if Type == 'text_file':
        # read the whole file
        with open(Name, 'r') as file:
            line = file.read()
        line = clean_lower(line, ['lower'])    # convert to lower case
        # level
        if Level == 'line':
            generated = line
        elif Level == 'char':
            # str.split("") raises ValueError; list() yields the characters
            generated = list(line)
        elif Level == 'word':
            generated = wt(line)
        elif Level == 'sentence_word':
            sentences = st(line)
            generated = [wt(s) for s in sentences]
        elif Level == 'sentence':
            generated = st(line)
        print("Total items:", len(generated))
        return generated
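
A minimal usage sketch (kept commented, since 'corpus.txt' is only a placeholder path); clean_lower is the project's own helper, and wt/st are assumed to be nltk.word_tokenize/nltk.sent_tokenize:

# words = getData('corpus.txt', 'text_file', 'word')                 # flat list of word tokens
# sent_words = getData('corpus.txt', 'text_file', 'sentence_word')   # one token list per sentence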
Example #2
def scoreline(line1, line2, metric, ic=None):
    sw = stopwords.words('english') # load English stopwords
    t1 = wt(line1) # tokenize line1
    t2 = wt(line2) # tokenize line2
    syns1 = reduce(lambda x, y: x + y, [wn.synsets(x) for x in t1 if x not in sw]) # synsets for the non-stopword tokens of line1
    syns2 = reduce(lambda x, y: x + y, [wn.synsets(x) for x in t2 if x not in sw]) # synsets for the non-stopword tokens of line2
    runningscore = 0.0
    runningcount = 0
    print("syns1: ", syns1)
    print("syns2: ", syns2)
    for syn1 in set(syns1): # get the WordNet similarity score for <metric> for each pair created from both synset lists
        for syn2 in set(syns2):
            if ic is not None:
                try:
                    mark = metric(syn1, syn2, ic) # information-content measures need the ic argument
                except Exception:
                    mark = 0.0
            else:
                try:
                    mark = metric(syn1, syn2)
                except Exception:
                    mark = 0.0
            if mark is None: # some measures return None for incomparable synsets
                mark = 0.0
            runningscore += mark
            runningcount += 1

    score = runningscore/runningcount # add up individual scores, divide by number of individual scores
    return score # return the overall score
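
A minimal usage sketch, assuming the imports the snippet relies on (functools.reduce on Python 3, NLTK's stopwords and WordNet corpora, and wt as word_tokenize); wn.path_similarity needs no information-content dictionary, so ic stays None:

from functools import reduce
from nltk import word_tokenize as wt
from nltk.corpus import stopwords, wordnet as wn

print(scoreline("The cat sat on the mat.", "A dog slept on the rug.", wn.path_similarity))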
Example #3
def sim_overlap(sentence1, sentence2): 
    # lowercase
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])                                         
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    # Calculate element numbers of intersection and sentence1
    # combined_line = line1 + line2
    # union_num = len(set(combined_line))
    intersection_num = len(set(line1) & set(line2))
    sentence1_num = len(set(line1))
    # return score = |Q intersect R| / |Q|
    sim = float(intersection_num) / float(sentence1_num)
    return sim


# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[0]
# sentence2 = list2[0]

# score = sim_overlap(sentence1, sentence2)
# print score
Example #4
    def classify(self, text):
        # Classify a sentence string or a list of sentences as norm and noNorm.

        if type(text) == str:
            # If text is a string, break it into tokens.
            # If among the tokens there is a modal verb, consider it a norm.
            # If it does not have a modal verb among the tokens, consider it a noNorm.

            tokens = wt(text)
            for token in tokens:
                if token in self.modal_verbs:
                    return 'norm'
            return 'noNorm'

        elif type(text) == list and text:
            # If text is a list of sentences, classify each sentence based on the existence or absence of modal verbs among the sentence tokens.
            output = []
            for sent in text:
                classified = 0
                tokens = wt(sent)
                for token in tokens:
                    if token in self.modal_verbs:
                        output.append((sent, 'norm'))
                        classified = 1
                        break

                if not classified:
                    output.append((sent, 'noNorm'))

            return output
Example #6
def get_overlap(sent1, sent2):
    sent1 = set(wt(sent1))
    sent2 = set(wt(sent2))
    try:
        # Jaccard overlap of the two token sets
        value = len(sent1.intersection(sent2)) / len(sent1.union(sent2))
        return max(value, 0)
    except ZeroDivisionError:  # both sentences tokenized to nothing
        return 0
def initialize_terms_and_postings():
    global dictionary, postings
    for id in document_filenames:
        terms = wt(id['description'])
        terms = terms + (wt(id['title']))
        #terms = ps(terms)
        unique_terms = set(terms)
        dictionary = dictionary.union(unique_terms)
        for term in unique_terms:
            postings[term][id['doc_id']] = terms.count(term)
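
A minimal sketch of the globals the function above expects; the names come from the code, but the shapes and the sample document are guesses:

from collections import defaultdict
from nltk import word_tokenize as wt

document_filenames = [
    {'doc_id': 'd1', 'title': 'Privacy policy', 'description': 'We never sell personal data.'},
]
dictionary = set()
postings = defaultdict(dict)   # term -> {doc_id: term frequency}

initialize_terms_and_postings()
print(postings['data'])        # {'d1': 1}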
def sem_wsd_corpus(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Get a list of synsets or terms: a synset for each term that has one, the raw term for those that don't.
    Disambiguate using internal maximization over the corpus (either internal word max or internal sentence max).

    Return: total_synset_sentence_list (list of strings, one rewritten sentence per input line)
    """
    # get a term based corpus list for compute internal corpus maximization WSD
    corpus_list = []
    for line in line_list:
        line = line.lower()
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        line_token = wt(nopunct_line)
        corpus_list = corpus_list+line_token
    corpus_list = list(set(corpus_list))

    # start
    total_synset_sentence_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list of wsd synsets
        synset_list = reduce(lambda x,y:x+y, [ [internal_sentence_max_WSD(corpus_list, x)] for x in line_token if wn.synsets(x) ])
        # synset_list = reduce(lambda x,y:x+y, [ [internal_word_max_WSD(corpus_list, x)] for x in line_token if wn.synsets(x) ])
        # format synset into term, e.g. from Synset.share.v.1 -> sharev1
        synset_formatted_list = []
        for synset in synset_list:
            formatted_term = re.sub('[^A-Za-z0-9]+', '', str(synset))
            # strip the literal 'Synset' prefix (str.lstrip would also eat leading letters of the lemma)
            formatted_term = re.sub('^Synset', '', formatted_term)
            synset_formatted_list.append(formatted_term)
        # list of terms without a synset definition
        nonsynset_list = [ x for x in line_token if not wn.synsets(x)]
        # add synset list and nonsynset list together
        total_synset_list = synset_formatted_list + nonsynset_list
        # back to sentence as a string
        total_synset_sentence = ' '.join(total_synset_list)
        total_synset_sentence_list.append(total_synset_sentence)
    return total_synset_sentence_list
def select_by_pos(line_list):
    """
    Input: line_list (list of strings(sentences/documents))

    Iterates over all terms in lines, select terms with meaningful type of POS

    Return: POSed_list (list of strings(terms that meets the POS criteria))
    """
    POSed_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # POS 
        pos_line = pos_tag(line_token)
        # filter line using POS info
        # only remain verbs, nouns, adverbs, adjectives
        filtered_line = []
        for tagged_tuple in pos_line:
            term = tagged_tuple[0]
            tag  = tagged_tuple[1]
            # find out all verbs, nouns, adverbs, adjectives
            if tag.startswith('V') or tag.startswith('N') or tag.startswith('R') or tag.startswith('J'):
                filtered_line.append(term)
        # back to sentence as a string
        POSed_sentence = ' '.join(filtered_line)
        POSed_list.append(POSed_sentence)
    return POSed_list
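
A minimal usage sketch (assumes import re, from nltk import word_tokenize as wt, pos_tag, and the usual NLTK tagger models installed):

print(select_by_pos(["Amazon.com quickly shares user data with partners!"]))
# keeps only verbs, nouns, adverbs and adjectives, e.g. ['amazon com quickly shares user data partners']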
Example #10
def Format_Problems(category):
    problems = {}

    #Something we want to delete in the question, in order to do query in Lemur
    Punc = set(string.punctuation)
    Aux_Art = {'is','was','are','were','did','does','do','the','a'}

    if category == "question":
        key = 88 #initialize problem No.
    else:
        key = 0 #initialize problem No.
    
    flag = 0 #indicate if this line is the content of this question
    with open(File_path+ category +'.txt','r') as f:
        for row in f.readlines():
            if row.find('<num>') != -1:
                key = key + 1
            elif row.find('<desc>') != -1:
                flag = 1
            elif flag == 1: #extract the content of the problem
                #delete all punctuations
                content = ''.join(ch for ch in row[0:len(row)-2].replace("'",' ') if ch not in Punc)

                #delete the words we don't want to include in the query
                question = ''
                for part in wt(content):
                    if part not in Aux_Art:
                        question = question + part + ' '
                
                problems[key] = question[0:len(question)-1] 
                flag = 0

                
    return problems
Example #11
def find_features(document):
    words = wt(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features
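
A minimal sketch, assuming word_features is a module-level list of candidate words (the words below are placeholders) and wt is nltk.word_tokenize:

from nltk import word_tokenize as wt

word_features = ['good', 'bad', 'movie']
print(find_features("This movie was surprisingly good."))
# {'good': True, 'bad': False, 'movie': True}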
def pos_tagging(line_list):
    """
    Input: line_list (list of strings(sentences/documents))

    Iterates over all terms in lines, add POS tag to words.
    E.g. 'said' -> ('said', 'VBD') -> saidVBD

    Return: tagged_list (list of strings(terms concatenated with their POS tags))
    """
    tagged_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # POS 
        pos_line = pos_tag(line_token)
        # append each term concatenated with its POS tag
        tagged_line = []
        for tagged_tuple in pos_line:
            term = tagged_tuple[0]
            tag  = tagged_tuple[1]
            tagged_line.append(term+tag)
        # back to sentence as a string
        tagged_sentence = ' '.join(tagged_line)
        tagged_list.append(tagged_sentence)
    return tagged_list
def stemming(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, stem them

    Return: stemmed_list (list of strings(terms that stemmed))
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list to store stemmed terms
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem(term)   # stem() is the current PorterStemmer API; stem_word() is gone from newer NLTK
            stemmed_line.append(term)
        # back to sentence as a string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
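
A minimal usage sketch (assumes import re, from nltk import word_tokenize as wt, and from nltk.stem import PorterStemmer):

print(stemming(["The cats are running faster than the dogs."]))
# e.g. ['the cat are run faster than the dog']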
def lemmatizing(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, lemmatize them using WordNetLemmatizer()

    Return: lemmatized_list (list of strings(lemmatized terms))
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # lemmatize each token
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
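
A minimal usage sketch (assumes import re, from nltk import word_tokenize as wt, from nltk.stem import WordNetLemmatizer, and the WordNet data):

print(lemmatizing(["The cats are running faster than the dogs."]))
# e.g. ['the cat are running faster than the dog']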
def pos_lemmatizing(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, lemmatize them using WordNetLemmatizer()
    Terms are pre-processed using POS tagging to improve accuracy

    Return: lemmatized_list (list of strings(lemmatized terms))
    """
    lemmatized_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()

        # punctuation removal before POS tagging seems to be a bad idea, so it is skipped here
        
        # tokenize
        line_token = wt(line)
        # POS 
        pos_line = pos_tag(line_token)
        # list for all lemmatized terms
        lemmatized_line = []
        for lemma in pos_line:
            term = wn_lemmatize(lemma)
            lemmatized_line.append(term)
        # back to sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
Example #16
    def cacs(self):
        inputWord = []
        for i in self.cleaned_1[0]:
            for j in wt(i):
                inputWord.append(j)

        tew = self.totalEngWords()
        engWord = []
        try:
            for i in inputWord:
                if len(i) > 2:
                    for j in tew:
                        if i == j:
                            engWord.append(i)
        except:
            print('Error')
        engWord = list(set(engWord))
        fa = self.finalAppend()
        correct = fa[1]
        fw = self.finalWrong()
        fWrong = fw[0]
        fCorrect = fw[1]
        cleanedData = []
        for i in self.cleaned_1[0]:
            new_sentence = []
            for j in wt(i):
                count = 0
                for k in engWord:
                    if j == k:
                        count += 1
                if count > 0:
                    count_new = 1
                    for l in correct:
                        if j == l:
                            new_sentence.append(j.lower())
                            count_new = 0
                            break
                    if count_new:
                        for l in fWrong:
                            if j == l:
                                new_sentence.append(fCorrect[fWrong.index(l)])
                                break
                else:
                    new_sentence.append(j.lower())
            new_sentence = " ".join(new_sentence)
            cleanedData.append(new_sentence)
        return cleanedData
def scoreline(line1,line2,metric,ic=None): 
    sw = stopwords.words('english') # import stopwords 
    t1 = wt(line1) # tokenize line1
    t2 = wt(line2) # tokenize line2

    # for x in t1:
    #     if x not in sw:
    #         print x, wn.synsets(x)

    # get list of synsets for tokens of line1 
    syns1 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in t1 if x not in sw])
    # syns1 = reduce(lambda x,y:x+y,[ wn.synsets(x)[0] for x in t1 if ((x not in sw) and (wn.synsets(x))) ])
    # get list of synsets for tokens of line2 
    # syns2 = reduce(lambda x,y:x+y,[wn.synsets(x)[0] for x in t2 if x not in sw])
    runningscore = 0.0
    runningcount = 0
    print "syns1: ", syns1
Example #18
def clean_words(nce):
    nce = nce.replace('’', '\'')
    nce = nce.replace('‘', '\'')
    words = wt(nce)
    words = [wnl.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]

    return set(words)
def sim_overlap_idf(sentence1, sentence2):
    # lowercase (the [a-z] filter below would otherwise drop upper-case letters)
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    
    # intersection: (Q intersect R)
    intersection = set(line1) & set(line2)
    # calculate sum of idfs: Sum(idf_w) for w in (Q intersect R)
    sum_idf = 0.0
    for item in intersection:
        idf = text_collection.idf(item)
        sum_idf += idf
    
    # Calculate element numbers of intersection and sentence1
    intersection_num = len(intersection)
    sentence1_num = len(set(line1))
    # sim = |Q intersect R| / |Q|
    try:
        sim = float(intersection_num) / float(sentence1_num)
    except ZeroDivisionError:   # sentence1 produced no tokens
        sim = 1
    # sim = ( |Q intersect R| / |Q| ) * Sum(idf_w) for w in (Q intersect R)
    sim = sim * sum_idf
    return sim


# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[0]
# sentence2 = list2[0]

# score = sim_overlap_idf(sentence1, sentence2)
# print score
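
sim_overlap_idf relies on a module-level text_collection; a minimal sketch of that global, built with NLTK's TextCollection over a toy corpus (the documents are placeholders, and re/wt are assumed imported as elsewhere in this listing):

from nltk import word_tokenize as wt
from nltk.text import TextCollection

docs = ["we do not sell your data", "we may share data with partners"]
text_collection = TextCollection([wt(d.lower()) for d in docs])

print(sim_overlap_idf("we never sell data", "we do not sell your data"))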
Example #20
def build_wiki_data(corpus, vocab , k= 30000):
    mostfreqk = vocab[-k+5:]
    vocabd = dict(zip( [w for w,v in mostfreqk ], range(k-5)))
    
    vocabd['DG'] = k-5
    vocabd['DGDG'] = k-4
    vocabd['DGDGDG'] = k-3
    vocabd['DGDGDGDG'] = k-2
    vocabd['UUUKKKNNN'] = k-1

    assert(len(vocabd.keys()) == k )

    newcorpus = []
    newcorpusf = []
    tokenizer = wt()   # here wt is a tokenizer class with a .tokenize() method, not word_tokenize
    print('Total documents:', len(corpus))
    for aa, doc in enumerate(corpus):
        print(aa, end=' ')
        sys.stdout.flush()
        strlist = tokenizer.tokenize(doc.lower())
        docidx = []
        docfiltered = []
        for w in strlist:
            try:
                docidx.append(vocabd[w])
                #docfiltered.append(w)
            except:
                if w.isdigit() and len(w) <= 4:
                    docidx.append(k-len(w)-1)
                    #docfiltered.append('DG'*len(w))
                    continue
                else:

                    try:
                        w = remove_accents(w)
                    except:
                        continue

                    if len(w.split("\\x")) > 1:
                        continue
                    elif len(w) <= 1 and w not in string.punctuation:
                        continue

                    elif len(re.sub(r'\W+', '', w)) == 0 and len(w) != 1:
                        continue
                    else:
                        docidx.append(k-1)
                        #docfiltered.append('UUUKKKNNN')


        newcorpus.append(docidx)
        #newcorpusf.append(docfiltered)

    return newcorpus, newcorpusf, vocabd
Example #21
def freq_words(file, min=1, num=10):
    text = open(file).read()
    tokens = wt(text)
    print(len(tokens))
    # 734989
    print(len(set(tokens)))
    freqdist = nltk.FreqDist(t for t in tokens if len(t) <= min)
    print(len(freqdist))
    # 4540
    # freqdist.plot()
    return freqdist.keys()
def ner_sents(sents):
	"""
	Given a list of word-tokenized sentences, 
	Returns the tree of the sentences.
	"""

	sentences = []

	for sent in sents:
		sentences.append(NER(POS(wt(sent))))

	return sentences
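
A minimal usage sketch, assuming wt, POS and NER are aliases for nltk.word_tokenize, nltk.pos_tag and nltk.ne_chunk (with the corresponding NLTK models installed):

from nltk import word_tokenize as wt, pos_tag as POS, ne_chunk as NER

trees = ner_sents(["Alice met Bob in London."])
print(trees[0])   # an nltk.Tree whose subtrees mark entities such as PERSON and GPE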
Example #23
def build_wiki_data(corpus, vocab, k=30000):
    mostfreqk = vocab[-k + 5:]
    vocabd = dict(zip([w for w, v in mostfreqk], range(k - 5)))

    vocabd['DG'] = k - 5
    vocabd['DGDG'] = k - 4
    vocabd['DGDGDG'] = k - 3
    vocabd['DGDGDGDG'] = k - 2
    vocabd['UUUKKKNNN'] = k - 1

    assert (len(vocabd.keys()) == k)

    newcorpus = []
    newcorpusf = []
    tokenizer = wt()   # here wt is a tokenizer class with a .tokenize() method, not word_tokenize
    print('Total documents:', len(corpus))
    for aa, doc in enumerate(corpus):
        print(aa, end=' ')
        sys.stdout.flush()
        strlist = tokenizer.tokenize(doc.lower())
        docidx = []
        docfiltered = []
        for w in strlist:
            try:
                docidx.append(vocabd[w])
                #docfiltered.append(w)
            except:
                if w.isdigit() and len(w) <= 4:
                    docidx.append(k - len(w) - 1)
                    #docfiltered.append('DG'*len(w))
                    continue
                else:

                    try:
                        w = remove_accents(w)
                    except:
                        continue

                    if len(w.split("\\x")) > 1:
                        continue
                    elif len(w) <= 1 and w not in string.punctuation:
                        continue

                    elif len(re.sub(r'\W+', '', w)) == 0 and len(w) != 1:
                        continue
                    else:
                        docidx.append(k - 1)
                        #docfiltered.append('UUUKKKNNN')

        newcorpus.append(docidx)
        #newcorpusf.append(docfiltered)

    return newcorpus, newcorpusf, vocabd
Example #24
def get_length():
    train = []
    for sent in sentences:
        # character length
        a1 = len(sent)

        # number of word tokens
        b = wt(sent)
        a2 = len(b)

        # number of "'s", i.e. possessives
        a3 = sent.count('\'s')

        # number of punctuation marks such as '.' and '?'
        a4 = sent.count('.') + sent.count('?')

        # length of the longest word (plain max() would compare alphabetically)
        a5 = len(max(b, key=len))

        train.append([a1, a2, a3, a4, a5])

    return train
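
A minimal usage sketch, assuming sentences is a module-level list of raw sentence strings and wt is nltk.word_tokenize:

from nltk import word_tokenize as wt

sentences = ["John's dog runs fast.", "Is it raining today?"]
for feats in get_length():
    print(feats)   # [char length, token count, possessive count, ./? count, longest-word length]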
Example #25
def PreprocessCSV(csvfile, outputfile):
    """
    output a csv file and return a word list.
    """
    print("Start preprocessing %s ..." % csvfile)
    voc = []
    dataframe = pandas.read_csv(csvfile, usecols=["Insult", "Comment"])
    labels = dataframe.iloc[:, 0].tolist()
    sents = dataframe.iloc[:, 1].tolist()
    newsents = []
    for sent in sents:
        # process sentences of samples
        # in case of blank, add a useless flag at the end
        sent = sent.strip("\"").lower()
        sent = sent.replace("\t", " ")
        sent = sent.replace("\n", " ")
        sent = sent.replace("\xa0", " ")
        sent = sent.replace("\xc2", " ")
        sent = sent.replace("\xc8", " ")
        sent = sent.replace("\xec", " ")
        sent = sent.replace("\x80", " ")
        sent = sent.replace("\xa6", " ")
        sent = re.sub("[$%^&*\[\]]", "", sent)
        tks = wt(sent)
        newtks = []

        #built first-part features
        for tk in tks:
            if tk.isalpha():
                tk = wnl().lemmatize(tk)
                newtks.append(tk)
                voc.append(tk)
            else:
                pass
        newsent = " ".join(newtks)
        newsent = newsent + " " + "auselessflag"
        newsents.append(newsent)

    # write the outputfile
    col_order = ["Insult", "Comment"]
    dataframe2 = pandas.DataFrame({"Insult": labels, "Comment": newsents})
    dataframe2.to_csv(outputfile, index=False, columns=col_order)
    fdist = FreqDist(voc)
    keys = fdist.keys()
    wordlist = []
    for key in keys:
        wordlist.append(key)
    print(
        "file \"%s\" is preprocessed, and there are %d keys in the return wordlist."
        % (csvfile, len(wordlist)))
    return wordlist
Example #26
def length_ana():
    for indx, nce in enumerate(corpus):
        result = []

        for sent in nce:
            tk = wt(sent)
            result.append(len(tk))
        print(result)
        fi = int(indx / 2) + 1
        fig1 = plt.figure(fi)
        plt.subplot(int("21%s" % (indx % 2 + 1)))
        plt.hist(result)
        plt.xlabel('new concept number %s' % (indx + 1))
    plt.show()
Example #27
    def extract_norms(self, contract_sents):
        # Return norms from a list of sentences.
        output = []

        if type(contract_sents) != list:
            contract_sents = self.sent_tokenizer.tokenize(contract_sents)

        for sentence in contract_sents:
            tokens = wt(sentence)
            for token in tokens:
                if token in self.modal_verbs:
                    output.append(sentence)
                    break
        return output
Example #29
def build_wiki_vocab(corpus):
    sentences = corpus

    tokenizer = wt()
    totalvocab = {}

    for s in sentences:
        strlist = tokenizer.tokenize(s.lower())
        for w in strlist:
            try:
                totalvocab[w] +=1
            except: 
               totalvocab[w] = 1

    sortvoc = sorted(totalvocab.items(), key=operator.itemgetter(1))
    return sortvoc
Example #30
def build_wiki_vocab(corpus):
    sentences = corpus

    tokenizer = wt()
    totalvocab = {}

    for s in sentences:
        strlist = tokenizer.tokenize(s.lower())
        for w in strlist:
            try:
                totalvocab[w] += 1
            except:
                totalvocab[w] = 1

    sortvoc = sorted(totalvocab.items(), key=operator.itemgetter(1))
    return sortvoc
Example #31
def addInfo(file, info, categories):
    '''NOT IN USE'''
    data = loadData(file)
    tokens = wt(categories)
    words = [w for w in tokens]
    print(words)
    #print(data)
    if len(words) == 1:
        data[words[0]] = info
    elif len(words) == 2:
        data[words[0]][words[1]] = info
    elif len(words) == 3:
        data[words[0]][words[1]][words[2]] = info
    elif len(words) == 4:
        data[words[0]][words[1]][words[2]][words[3]] = info
    elif len(words) > 4 or len(words) == 0:
        msg = "Categories aren't right"
        return msg
    print(data[words[0]][words[1]])
def clean_tokenize(sentence):
    """
	Tokenize a sentence (after removing stopwords and punctuation).
	
	Parameters
	----------
	sentence : string. The sentence from which we want to extract the keywords.
	
	Returns
	-------
	keywords : list of strings. The list of the non stop words.
	"""

    stop = set(stopwords.words("english"))
    keywords = [
        word.lower() for word in wt(re.sub("[^a-zA-Z]", " ", sentence))
        if word.lower() not in stop
    ]

    return keywords
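
A minimal usage sketch (assumes import re, from nltk import word_tokenize as wt, and the standard NLTK English stopword list):

from nltk import word_tokenize as wt
from nltk.corpus import stopwords

print(clean_tokenize("What is the capital of France?"))
# ['capital', 'france']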
def do_search():

    query = wt(input("Search query >> "))
    if query == []:
        sys.exit()
    # find document ids containing all query terms.  Works by
    # intersecting the posting lists for all query terms.
    relevant_document_ids = intersection(
        [set(postings[term].keys()) for term in query])
    list(relevant_document_ids)
    if not relevant_document_ids:
        print("No documents matched all query terms.")
    else:
        scores = sorted([(id, similarity(query, id))
                         for id in relevant_document_ids],
                        key=lambda x: x[1],
                        reverse=True)
        print("Score: filename")
        for (id, score) in scores:
            print(str(score) + ": " + id)
Example #34
def NgramWords(csvfile, n=2, minx=2, maxx=6):
    ngramlist = []
    bgramdict = defaultdict(int)
    dataframe = pandas.read_csv(csvfile, usecols=["Comment"])
    sents = dataframe.iloc[:, 0].tolist()
    for sent in sents:
        words = wt(sent)
        if len(words) > (
                n + 2):  #because there is a useless flag at the end of text
            for i in range((len(words) - n)):
                bword = ""
                for j in range(n):
                    bword += words[i + j]
                bgramdict[bword] += 1
    for key in bgramdict.keys():
        if bgramdict[key] > minx and bgramdict[key] < maxx:
            ngramlist.append(key)
    print("there are %d %d-gram words in ngramlist from %s." %
          (len(ngramlist), n, csvfile))
    #print(ngramlist[:10])
    return ngramlist
Example #35
def fristXpq(sDat, max_len, x=0):
    token = wt(sDat.lower())
    lt = []
    if len(sDat) < max_len:
        max_len = len(sDat)
    for i in range(1, max_len + 1):
        text = []
        if i > 1:
            text = textToWordList(token, i)
        else:
            text = token
        if x == 0:
            fdist = fd(text).most_common()
        else:
            fdist = fd(text).most_common(x)
        fdist = [[fdist[k][0], fdist[k][1]]
                 for tp, k in zip(fdist, range(len(fdist)))]
        #        sfdist=[[fdist[k][0],fdist[k][1]/len(text)] for tp,k in zip(fdist,range(len(fdist)))]
        lt.extend(fdist)
#        lt=fqtopq(lt,len(text))
    return (lt)
def trees_2_toks(sentences):
	"""
	From a list of chunked sentences
	returns the tokenized sentences
	without any POS or NER tag.
	"""
	
	sents = []

	for sent in sentences:
		# transforms in conll and delete the syntactic infos
		conll = nltk.chunk.tree2conllstr(sent)
		lines = conll.split('\n')
		s = ''
		for line in lines:
			line = line.split(' ')
			s = s + line[0] + ' '
		sents.append(wt(s[:-1]))

	# removes the last white character
	return sents
def sem_firstsense(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Get a list of synsets or terms: a synset for each term that has one, the raw term for those that don't.
    Use first senses

    Return: total_synset_sentence_list (list of strings, one rewritten sentence per input line)
    """
    total_synset_sentence_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation: a naive character filter would break tokens such as
        # amazon.com => amazoncom, so replace every non-alphanumeric run with a space instead
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list of first-sense synsets
        # t0 = time()
        synset_list = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in line_token if wn.synsets(x) ])
        # # First run uses about 2.5 s
        # print "Done in %fs" % (time() - t0)
        # format synset into term, e.g. Synset.share.v.1 -> sharev1
        synset_formatted_list = []
        for synset in synset_list:
            formatted_term = re.sub('[^A-Za-z0-9]+', '', str(synset))
            # strip the literal 'Synset' prefix (str.lstrip would also eat leading letters of the lemma)
            formatted_term = re.sub('^Synset', '', formatted_term)
            synset_formatted_list.append(formatted_term)
        # list of terms without a synset definition
        nonsynset_list = [ x for x in line_token if not wn.synsets(x)]
        # add synset list and nonsynset list together
        total_synset_list = synset_formatted_list + nonsynset_list
        # back to sentence as a string
        total_synset_sentence = ' '.join(total_synset_list)
        total_synset_sentence_list.append(total_synset_sentence)
    return total_synset_sentence_list
def extract_key_words(sentence, score_function, n, *args):
    """ 
	Function that extracts the most relevant keywords according to the scrabble_score.
	
	Parameters
	----------
	sentence       : string. In our case the question from which we want to extract the keywords.
	score_function : function. Function that computes the score given a word.
	n              : int. The number of keywords we want to extract (descending order).
	*args          : extra arguments passed to score_function.
	
	Returns
	-------
	keywords : list of strings. The list of the n most relevant keywords according to the score.
	"""

    words = wt(sentence)
    scores_words = {}
    for word in words:
        scores_words[word] = score_function(word, *args)
    keywords = sorted(scores_words, key=scores_words.get, reverse=True)[:n]
    return keywords
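
A minimal usage sketch with a stand-in score function: the docstring mentions a Scrabble-style score, but plain word length (len) is used here just to exercise the API (wt assumed to be nltk.word_tokenize):

print(extract_key_words("Which river flows through Budapest?", len, 2))
# ['Budapest', 'through']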
def q1():
	# 1. Print the number of word tokens
	# YOUR CODE
	from nltk.corpus import gutenberg as gb
	#if you want to print all file ids in gutenberg archive
	#print(gb.fileids())
	file_id = 'austen-sense.txt'
	word_list = gb.words(file_id)
	print(len(word_list))

	# 2. Print the number of word types
	# YOUR CODE
	print(len( set( [ w.lower() for w in word_list ]) ))
	
	# 3. Print all tokens in the first sentence
	# YOUR CODE
	sent_list = gb.sents(file_id)
	print(' '.join(sent_list[0]))

	# if you want to tokenize a string
	raw = 'i have a book.'
	from nltk import word_tokenize as wt 
	word_list = wt(raw)
def pos_bagging(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Use POS tags to replace all words

    Return: bagged_list (list of strings(POS tags standing in for the original terms))
    """
    bagged_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()

        # punctuation removal before POS tagging seems to be a bad idea, so it is skipped here
        
        # tokenize
        line_token = wt(line)
        # POS 
        pos_line = pos_tag(line_token)
        # replace each term with its POS tag
        bagged_line = []
        for tagged_tuple in pos_line:
            term = tagged_tuple[0]
            tag  = tagged_tuple[1]
            bagged_line.append(tag)
        # back to sentence as a string
        bagged_sentence = ' '.join(bagged_line)
        bagged_list.append(bagged_sentence)
    return bagged_list
def sim_overlap_phrasal(sentence1, sentence2): 
    # lowercase
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])                                         
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)

    # finders for bigram and trigram
    finder_bi_line1 = collocations.BigramCollocationFinder.from_words(line1)
    finder_bi_line2 = collocations.BigramCollocationFinder.from_words(line2)
    finder_tri_line1 = collocations.TrigramCollocationFinder.from_words(line1)
    finder_tri_line2 = collocations.TrigramCollocationFinder.from_words(line2)
    # find bigram / trigram
    scored_bi_line1 = finder_bi_line1.score_ngrams(bigram_measures.raw_freq)
    scored_bi_line2 = finder_bi_line2.score_ngrams(bigram_measures.raw_freq)
    scored_tri_line1 = finder_tri_line1.score_ngrams(bigram_measures.raw_freq)
    scored_tri_line2 = finder_tri_line2.score_ngrams(bigram_measures.raw_freq)
    # generate lists contain all the bigram or trigram for line1 and line2
    list_bi_line1 = sorted(bigram for bigram, score in scored_bi_line1)
    list_bi_line2 = sorted(bigram for bigram, score in scored_bi_line2)
    list_tri_line1 = sorted(trigram for trigram, score in scored_tri_line1)
    list_tri_line2 = sorted(trigram for trigram, score in scored_tri_line2)
    # find the common elements from two sets of bigram in two sentences
    common_set_bi = [i for i in list_bi_line1 if i in list_bi_line2]
    common_set_tri = [i for i in list_tri_line1 if i in list_tri_line2]

    # Calculate element numbers of intersection and sentence1
    # combined_line = line1 + line2
    # union_num = len(set(combined_line))
    intersection_len = len(set(line1) & set(line2))
    sentence1_len = len(set(line1))
    sentence2_len = len(set(line2))

    # Overlap (phrasal) score
    # Note, here we only consider trigram and bigram
    overlap_score = 9*len(common_set_tri) + 4*len(common_set_bi) + intersection_len

    # Normalization as defined in Ponzetto et al. 2007
    sim = float(overlap_score) / (sentence1_len+sentence2_len)
    sim = tanh(sim)

    return sim




# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[1]
# sentence2 = list2[3]

# score = sim_overlap_phrasal(sentence1, sentence2)
# print score
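
sim_overlap_phrasal relies on several module-level names (re, wt, tanh, collocations, bigram_measures); a minimal sketch of those globals, with the aliases guessed from how they are used:

import re
from math import tanh
from nltk import collocations, word_tokenize as wt

bigram_measures = collocations.BigramAssocMeasures()

print(sim_overlap_phrasal("we do not sell your personal data",
                          "we will not sell personal data to third parties"))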
Example #42
__author__ = "Soumik"
import nltk, re, pprint
from nltk import word_tokenize as wt
import codecs

f = open("pos.wn")
raw = f.read()
tokens = wt(raw)
text = nltk.Text(tokens)
g = []
for x in tokens:
    y = x.replace("_", " ")
    g.append(y)
print(g)
f1 = open("neg.wn")
raw1 = f1.read()
tokens1 = wt(raw1)
text1 = nltk.Text(tokens1)
h = []
for z in tokens1:
    a = z.replace("_", " ")
    h.append(a)
print("\n", h)

pcount = 0
ncount = 0

pos_text = open("posTweets.txt", encoding="utf8").read()
for x in h:
    if x in pos_text:
        pcount = pcount + 1
print("The number of positive tweets is: %s" % pcount)
def main(f_1, f_2):

	"""
	Extracts basic stats from 2 text files
	"""

	######################################
	#	Text extraction
	######################################

	t_1 = ''
	t_2 = ''


	with open(f_1, 'r') as f:
		t_1 = f.read()
	with open(f_2, 'r') as f:
		t_2 = f.read()

	######################################
	#	Text cleaning
	######################################

	t_1 = cl.clean_text(t_1)
	t_2 = cl.clean_text(t_2)

	######################################
	#	Extracting information
	######################################

	# Sentence tokens
	sents_1 = st.tokenize(t_1)
	sents_2 = st.tokenize(t_2)
	
	# Word tokens (by sentence)
	s_toks_1 = []
	s_toks_2 = []

	for s_1, s_2 in zip(sents_1, sents_2):
		s_toks_1.append(wt(s_1))
		s_toks_2.append(wt(s_2))

	# Word tokens (unique list)
	toks_1 = wt(t_1)
	toks_2 = wt(t_2)



	# Initialize output
	output = OUT.init_html()
	soup = BeautifulSoup(output, 'html.parser')

	col_1 = soup.find('div', {'id': 'text-1-container'})
	col_2 = soup.find('div', {'id': 'text-2-container'})

	title_1 = soup.new_tag('p', attrs={'id' : 'title-1'})
	title_1.insert(1,'Alice in Wonderland')

	title_2 = soup.new_tag('p', attrs={'id' : 'title-2'})
	title_2.insert(1,'Second Variety')

	author_1 = soup.new_tag('p', attrs={'id' : 'author-1'})
	author_1.insert(1,'Lewis Carroll')

	author_2 = soup.new_tag('p', attrs={'id' : 'author-2'})
	author_2.insert(1,'Philip Kindred Dick')

	col_1.append(title_1)
	col_1.append(author_1)

	col_2.append(title_2)
	col_2.append(author_2)

	# Print number of sentences
	nc_1 = OUT.create_container('Number of sentences', str(len(sents_1)), soup)
	nc_2 = OUT.create_container('Number of sentences', str(len(sents_2)), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Number of sentences to project/index.html')

	# Print number of tokens
	nc_1 = OUT.create_container('Number of tokens', str(len(toks_1)), soup)
	nc_2 = OUT.create_container('Number of tokens', str(len(toks_2)), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Number of tokens to project/index.html')

	# Print mean sentence length
	nc_1 = OUT.create_container('Mean sentence length', str(tool.minify(LC.mean_len(s_toks_1))), soup)
	nc_2 = OUT.create_container('Mean sentence length', str(tool.minify(LC.mean_len(s_toks_2))), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Mean sentence length to project/index.html')

	# Print word length
	nc_1 = OUT.create_container('Mean word length', str(tool.minify(LC.mean_len(toks_1))), soup)
	nc_2 = OUT.create_container('Mean word length', str(tool.minify(LC.mean_len(toks_2))), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Mean word length to project/index.html')



	# Vocab
	voc_1 = LC.create_vocab(toks_1)
	voc_2 = LC.create_vocab(toks_2)

	# Print vocabulary dimension

	nc_1 = OUT.create_container('Vocabulary dimension', str(len(voc_1)), soup)
	nc_2 = OUT.create_container('Vocabulary dimension', str(len(voc_2)), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Vocabulary dimension to project/index.html')


	# Frequencies
	freq_1 = LC.freqs(toks_1, voc_1)
	freq_2 = LC.freqs(toks_2, voc_2)

	# Hapax_list
	hapax_1 = LC.get_hapax(freq_1)
	hapax_2 = LC.get_hapax(freq_2)

	# Print number of hapaxes
	nc_1 = OUT.create_container('Number of hapaxes', str(len(hapax_1)), soup)
	nc_2 = OUT.create_container('Number of hapaxes', str(len(hapax_2)), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Number of hapaxes to project/index.html')

	print('PLOTTING HAPAXES - DISTRIBUTION...')

	# Plotting Hapaxes' distribution 
	LC.hapax_distr(toks_1, 1000, "Alice in Wonderland - Carrol")
	LC.hapax_distr(toks_2, 1000, "Second Variety - Dick")


	# Print plots
	nc_1 = OUT.create_img_container('Hapax distribution', '../plots/Alice in Wonderland - Carrol.svg', soup)
	nc_2 = OUT.create_img_container('Hapax distribution', '../plots/Second Variety - Dick.svg', soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Hapax distribution to project/index.html')

	# POS tagging
	POS_1 = POS(toks_1)
	POS_2 = POS(toks_2)

	# POS frequency
	"""
	Getting the frequency of every POS in the two texts
	"""

	POS_freqs_1 = LC.get_POS_freqs(POS_1)
	POS_freqs_2 = LC.get_POS_freqs(POS_2)


	NN_1 = POS_freqs_1['NN'] + POS_freqs_1['NNS'] + POS_freqs_1['NNP'] + POS_freqs_1['NNPS']
	VB_1 = POS_freqs_1['VB'] + POS_freqs_1['VBD'] + POS_freqs_1['VBG'] + POS_freqs_1['VBN'] + POS_freqs_1['VBP'] + POS_freqs_1['VBZ']

	NN_2 = POS_freqs_2['NN'] + POS_freqs_2['NNS'] + POS_freqs_2['NNP'] + POS_freqs_2['NNPS']
	VB_2 = POS_freqs_2['VB'] + POS_freqs_2['VBD'] + POS_freqs_2['VBG'] + POS_freqs_2['VBN'] + POS_freqs_2['VBP'] + POS_freqs_2['VBZ']

	# Print Nouns / Verbs
	nc_1 = OUT.create_container('Nouns / Verbs', str(tool.minify(NN_1 / VB_1)), soup)
	nc_2 = OUT.create_container('Nouns / Verbs', str(tool.minify(NN_2 / VB_2)), soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Nouns / Verbs to project/index.html')



	POS_rank_1 = LC.rank(POS_freqs_1)
	POS_rank_2 = LC.rank(POS_freqs_2)

	# Print most frequent POS
	nc_1 = OUT.complex_container('Most frequent POS', list(POS_rank_1)[:10], soup)
	nc_2 = OUT.complex_container('Most frequent POS', list(POS_rank_2)[:10], soup)
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Most frequent POS to project/index.html')



	# POS bigrams
	"""
	Getting the frequency of every POS-couple in the two texts
	"""
	POS_bi_1 = LC.tag_bigrams(POS_1)
	POS_bi_2 = LC.tag_bigrams(POS_2)

	POS_bi_set_1 = list(set(POS_bi_1))
	POS_bi_set_2 = list(set(POS_bi_2))


	POS_bi_freq_1 = LC.freqs(POS_bi_1, POS_bi_set_1)		
	POS_bi_freq_2 = LC.freqs(POS_bi_2, POS_bi_set_2)

	# POS Conditioned probabilities
	"""
	Now that we have the frequencies of every POS 
	and every couple of POS, we can compute 
	the conditioned probability for each couple.
	"""

	POS_cond_prob_1 = LC.get_cond_prob(POS_bi_freq_1, POS_freqs_1)
	POS_cond_prob_2 = LC.get_cond_prob(POS_bi_freq_2, POS_freqs_2)


	# Print conditioned-probability
	nc_1 = OUT.more_complex_container('Top conditioned-probable POS', LC.rank(POS_cond_prob_1)[:10], soup, ['', 'bigrams', 'probability'])
	nc_2 = OUT.more_complex_container('Top conditioned-probable POS', LC.rank(POS_cond_prob_2)[:10], soup, ['', 'bigrams', 'probability'])
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Top conditioned-probable POS to project/index.html')



	# POS Local Mutual Information

	POS_bi_LMI_1 = LC.get_LMI(POS_bi_freq_1, len(POS_bi_1), POS_freqs_1, len(POS_1), POS_bi_set_1)
	POS_bi_LMI_2 = LC.get_LMI(POS_bi_freq_2, len(POS_bi_2), POS_freqs_2, len(POS_2), POS_bi_set_2)

	ranked_POS_bi_LMI_1 = LC.rank(POS_bi_LMI_1)
	ranked_POS_bi_LMI_2 = LC.rank(POS_bi_LMI_2)

	# Print related bigrams
	nc_1 = OUT.more_complex_container('Top LMI-related bigrams', ranked_POS_bi_LMI_1[:10], soup, ['', 'bigrams', 'LMI'])
	nc_2 = OUT.more_complex_container('Top LMI-related bigrams', ranked_POS_bi_LMI_2[:10], soup, ['', 'bigrams', 'LMI'])
	col_1.append(nc_1)
	col_2.append(nc_2)
	print('added Top LMI-related bigrams to project/index.html')

	# Final prints
	title_1 = soup.new_tag('p', attrs={'id' : 'book-1'})
	title_1.insert(1,'Alice in Wonderland')
	soup.body.append(title_1)

	title_2 = soup.new_tag('p', attrs={'id' : 'book-2'})
	title_2.insert(1,'Second Variety')
	soup.body.append(title_2)

	# Exporting output
	with open('output/index_1.html', 'w') as h:
		h.write(str(soup))
def sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic, alpha=0.05):
    # Bug fix: lower
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # import stopwords 
    sw = stopwords.words('english')
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])                                         
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)

    # get list of synsets only using first senses, without stopword elimination
    synset_list1 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in line1 if wn.synsets(x) ])
    synset_list2 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in line2 if wn.synsets(x) ])
    # # get the synset list for each sentence, containing all WordNet senses
    # # with stopword elimination
    # synset_list1 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in line1 if x not in sw])
    # synset_list2 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in line2 if x not in sw])
    # # get list of synsets only using first senses, with stopword elimination
    # synset_list1 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in line1 if ((x not in sw) and wn.synsets(x)) ])
    # synset_list2 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in line2 if ((x not in sw) and wn.synsets(x)) ])

    mark_list = []
    # get Wordnet similarity score for <metric> for each pair created from both synset lists 
    for synset1 in set(synset_list1): 
        for synset2 in set(synset_list2): 
            if ic is not None:
                try:
                    mark = metric(synset1, synset2, ic)
                    if mark is None:
                        mark = 0.0
                except: 
                    mark = 0.0
                # handle the infinite mark returned by the jcn measure
                if mark == 1e+300:
                    mark = 1.0
                mark_list.append(mark)
            else: 
                try:
                    mark = metric(synset1, synset2)
                    if mark is None:
                        mark = 0.0
                except:
                    mark = 0.0
                mark_list.append(mark)

    # sort mark_list to be from highest to lowest
    mark_list.sort()
    mark_list.reverse()

    # calculate threshold given alpha and length of mark_list
    threshold = alpha * len(mark_list)
    threshold = int(round(threshold))
    # build the top alpha list of marks
    top_alpha_mark_list = mark_list[0:threshold]

    # add up individual scores, divide by number of individual scores
    # (guard against an empty list when alpha rounds the threshold down to zero)
    if not top_alpha_mark_list:
        return 0.0
    sim = sum(top_alpha_mark_list) / len(top_alpha_mark_list)
    return sim


# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[0]
# sentence2 = list2[1]

# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# # sim_sem_firstsense_alpha(sentence1, sentence2)

# score = sim_sem_firstsense_alpha(sentence1, sentence2)
# print 'path: ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown  : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# Sample results:
# sentence1 = list1[0]
# sentence2 = list2[1]
# alpha = 0.2
# path:  0.255693843194
# lch :  1.70093033207
# wup :  0.468924493692
# res - brown  :  2.33328289008
# res - semcor :  2.18274083157
# jcn :  0.2375842434
# lin :  0.273913605124
Example #45
def pos_negation_bigram(line_list):
    """
    Input: 
        line_list (list of strings(sentences/documents)) - e.g. dataset.data

    POS tag the line, match patterns of negations to form bigram terms

    Return: pos_neg_bigram_list (list of strings(term+POS tokens plus negation bigram terms))
    """
    neg_verb_set = ['not', 'never', 'neither']
    neg_noun_set = ['without']
    verb_window = 10
    noun_window = 3

    pos_neg_bigram_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # tokenize
        line_token = wt(line)
        # base for return
        pos_neg_bigram_line = []
        # POS 
        pos_line = pos_tag(line_token)

        # =========
        # POS part
        for tagged_tuple in pos_line:
            term = tagged_tuple[0]
            tag  = tagged_tuple[1]
            pos_neg_bigram_line.append(term+tag)
        
        # =========
        # Then negation bigram construction part
        # first iteration to find flag words
        neg_verb = None
        neg_verb_flag = None
        neg_noun_flag = None
        for i, tagged_tuple in enumerate(pos_line):
            term = tagged_tuple[0]
            if term in neg_verb_set:
                neg_verb_flag = i
                neg_verb = term
            elif term in neg_noun_set:
                neg_noun_flag = i

        # second iteration to find neg_verb match and form bigram
        if neg_verb_flag != None:
            for i, tagged_tuple in enumerate(pos_line):
                term = tagged_tuple[0]
                tag  = tagged_tuple[1]
                if (i-neg_verb_flag)<=verb_window and (i-neg_verb_flag)>0 and tag.startswith('V'):
                    pos_neg_bigram_line.append(neg_verb+term)

        # third iteration to find neg_noun match and form bigram
        if neg_noun_flag != None:
            for i, tagged_tuple in enumerate(pos_line):
                term = tagged_tuple[0]
                tag  = tagged_tuple[1]
                if (i-neg_noun_flag)<=noun_window and (i-neg_noun_flag)>0 and tag.startswith('N'):
                    pos_neg_bigram_line.append("without"+term)

        # back to sentence as a string
        neg_bigram_sentence = ' '.join(pos_neg_bigram_line)
        pos_neg_bigram_list.append(neg_bigram_sentence)
    return pos_neg_bigram_list
def main(f_1, f_2):
    """
	Extracts NER stats from 2 text files
	"""

    ######################################
    #	Text extraction
    ######################################

    t_1 = ''
    t_2 = ''

    with open(f_1, 'r') as f:
        t_1 = f.read()
    with open(f_2, 'r') as f:
        t_2 = f.read()

    ######################################
    #	Text cleaning
    ######################################

    t_1 = cl.clean_text(t_1)
    t_2 = cl.clean_text(t_2)

    # Initialize output
    output = OUT.init_html()
    soup = BeautifulSoup(output, 'html.parser')

    col_1 = soup.find('div', {'id': 'text-1-container'})
    col_2 = soup.find('div', {'id': 'text-2-container'})

    title_1 = soup.new_tag('p', attrs={'id': 'title-1'})
    title_1.insert(1, 'Alice in Wonderland')

    title_2 = soup.new_tag('p', attrs={'id': 'title-2'})
    title_2.insert(1, 'Second Variety')

    author_1 = soup.new_tag('p', attrs={'id': 'author-1'})
    author_1.insert(1, 'Lewis Carroll')

    author_2 = soup.new_tag('p', attrs={'id': 'author-2'})
    author_2.insert(1, 'Philip Kindred Dick')

    col_1.append(title_1)
    col_1.append(author_1)

    col_2.append(title_2)
    col_2.append(author_2)

    ######################################
    #	Extracting the useful elements
    ######################################

    # Sentence tokens
    sents_1 = st.tokenize(t_1)
    sents_2 = st.tokenize(t_2)

    # Word tokens (by sentence)
    s_toks_1 = []
    s_toks_2 = []

    for s_1, s_2 in zip(sents_1, sents_2):
        s_toks_1.append(wt(s_1))
        s_toks_2.append(wt(s_2))

    # Word tokens (unique list)
    toks_1 = wt(t_1)
    toks_2 = wt(t_2)

    # NER tagging

    POS_NE_1 = NE.ner_sents(sents_1)
    POS_NE_2 = NE.ner_sents(sents_2)

    PERSON_sents_1 = NE.get_ner_sents(POS_NE_1, ['PERSON'])
    PERSON_sents_2 = NE.get_ner_sents(POS_NE_2, ['PERSON'])

    # Get the entities of specific NER_tag
    ENTITIES_1 = NE.get_ner_entities(PERSON_sents_1, ['PERSON'])
    ENTITIES_2 = NE.get_ner_entities(PERSON_sents_2, ['PERSON'])

    PEOPLE_1 = LC.rank(ENTITIES_1['PERSON'])[:10]
    PEOPLE_2 = LC.rank(ENTITIES_2['PERSON'])[:10]

    # Print most frequent characters
    nc_1 = OUT.complex_container('Most frequent characters', PEOPLE_1, soup)
    nc_2 = OUT.complex_container('Most frequent characters', PEOPLE_2, soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Most frequent characters to project/index.html')

    # Extract only sentence in which appear the selected PEOPLE
    PEOPLE_1 = tool.tup_2_list(PEOPLE_1)
    PEOPLE_2 = tool.tup_2_list(PEOPLE_2)

    useful_sents_1 = NE.meaningful_sents(PERSON_sents_1, ['PERSON'], PEOPLE_1)
    useful_sents_2 = NE.meaningful_sents(PERSON_sents_2, ['PERSON'], PEOPLE_2)

    ######################################
    #	Extracting information
    ######################################
    """
	Now that we have extracted the useful sentences,
	we can begin to mine the information inside them.
	"""

    # NER_tags to look for in the sentence
    NER_tags = ['PERSON', 'LOC', 'GPE', 'DATE', 'TIME']
    POS_tags = [
        'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
    ]

    # For every sentence: find the PERSON and build {PERSON_1: {link1: freq, link2: freq}, PERSON_2: {link1: freq, link2: freq}}
    links_1 = NE.extract_links(useful_sents_1, {'PERSON': PEOPLE_1}, NER_tags,
                               POS_tags)
    links_2 = NE.extract_links(useful_sents_2, {'PERSON': PEOPLE_2}, NER_tags,
                               POS_tags)

    # Extract the frequency of each link
    links_freqs_1 = NE.extract_link_freqs(links_1)
    links_freqs_2 = NE.extract_link_freqs(links_2)

    #with open('tests/links_freqs_1.txt', 'w') as output:
    #output.write(str(links_freqs_1))

    # Extract the most frequent items for each field
    most_freq_1 = NE.rank_link_freqs(links_freqs_1, 10)
    most_freq_2 = NE.rank_link_freqs(links_freqs_2, 10)

    # Print infos TEXT_1
    for NER in most_freq_1:
        for leaf in most_freq_1[NER]:
            for link_type in most_freq_1[NER][leaf]:
                nc_1 = OUT.complex_container(leaf + ' + ' + link_type,
                                             most_freq_1[NER][leaf][link_type],
                                             soup)
                col_1.append(nc_1)
                print('added', leaf, ' + ', link_type, 'to project/index.html')

    # Print infos TEXT_2
    for NER in most_freq_2:
        for leaf in most_freq_2[NER]:
            for link_type in most_freq_2[NER][leaf]:
                nc_2 = OUT.complex_container(leaf + ' + ' + link_type,
                                             most_freq_2[NER][leaf][link_type],
                                             soup)
                col_2.append(nc_2)
                print('added', leaf, ' + ', link_type, 'to project/index.html')

    # Get the max-probability sentence of length between 8 and 12 tokens

    # Select sentences of length between 8 and 12 tokens containing the selected PEOPLE
    selected_sents_1 = tool.sents_of_len(NE.trees_2_toks(useful_sents_1), 8,
                                         12)
    selected_sents_2 = tool.sents_of_len(NE.trees_2_toks(useful_sents_2), 8,
                                         12)

    sents_4_person_1 = NE.assign_sent_2_person(selected_sents_1, PEOPLE_1)
    sents_4_person_2 = NE.assign_sent_2_person(selected_sents_2, PEOPLE_2)

    # Compute the probability for each sentence for each important person in the text
    max_markow_1 = {}
    for person in sents_4_person_1:
        max_markow_1[person] = NE.get_max_markow(sents_4_person_1[person],
                                                 fd(toks_1), len(t_1), False)

    max_markow_2 = {}
    for person in sents_4_person_2:
        max_markow_2[person] = NE.get_max_markow(sents_4_person_2[person],
                                                 fd(toks_2), len(t_2), False)

    # Print Markov probabilities
    for person in max_markow_1:
        nc_1 = OUT.tok_sent_container('Probable sentence for ' + person,
                                      max_markow_1[person], soup)
        col_1.append(nc_1)
        print('added ' + 'Probable sentence for ' + person +
              ' to project/index.html')

    for person in max_markow_2:
        nc_2 = OUT.tok_sent_container('Probable sentence for ' + person,
                                      max_markow_2[person], soup)
        col_2.append(nc_2)
        print('added ' + 'Probable sentence for ' + person +
              ' to project/index.html')

    # Final prints
    title_1 = soup.new_tag('p', attrs={'id': 'book-1'})
    title_1.insert(1, 'Alice in Wonderland')
    soup.body.append(title_1)

    title_2 = soup.new_tag('p', attrs={'id': 'book-2'})
    title_2.insert(1, 'Second Variety')
    soup.body.append(title_2)

    # Exporting output
    with open('output/index_2.html', 'w') as h:
        h.write(str(soup))
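

# A minimal usage sketch, assuming this module is run as a script; the two
# file paths below are illustrative placeholders, not taken from the source.
if __name__ == '__main__':
    main('data/alice_in_wonderland.txt', 'data/second_variety.txt')
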
def sim_sem_firstsense_pos(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic):
    # def sim_sem_firstsense_pos(sentence1, sentence2, metric=wn.path_similarity, ic=None):

    # Bug fix: lower
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # import stopwords
    sw = stopwords.words("english")
    # remove punctuation
    nopunct_sentence1 = "".join([c for c in sentence1 if re.match("[a-z\-' \n\t]", c)])
    nopunct_sentence2 = "".join([c for c in sentence2 if re.match("[a-z\-' \n\t]", c)])
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    # POS
    pos_line1 = pos_tag(line1)
    pos_line2 = pos_tag(line2)

    # filter line1 and line2 using POS info
    # only remain verbs, nouns, adverbs, adjectives
    filtered_line1 = []
    filtered_line2 = []
    for tagged_tuple in pos_line1:
        term = tagged_tuple[0]
        tag = tagged_tuple[1]
        # find out all verbs, nouns, adverbs, adjectives
        if tag.startswith("V") or tag.startswith("N") or tag.startswith("R") or tag.startswith("J"):
            filtered_line1.append(term)
    for tagged_tuple in pos_line2:
        term = tagged_tuple[0]
        tag = tagged_tuple[1]
        # find out all verbs, nouns, adverbs, adjectives
        if tag.startswith("V") or tag.startswith("N") or tag.startswith("R") or tag.startswith("J"):
            filtered_line2.append(term)

    # get list of synsets only using first senses, without stopword elimination
    # synset_list1 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in filtered_line1 if wn.synsets(x) ])
    # synset_list2 = reduce(lambda x,y:x+y, [ [wn.synsets(x)[0]] for x in filtered_line2 if wn.synsets(x) ])
    # # get the synset list for each sentence, containing all WordNet senses
    # # with stopword elimination
    # synset_list1 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in line1 if x not in sw])
    # synset_list2 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in line2 if x not in sw])
    # # get list of synsets only using first senses, with stopword elimination
    synset_list1 = reduce(lambda x, y: x + y, [[wn.synsets(x)[0]] for x in line1 if ((x not in sw) and wn.synsets(x))])
    synset_list2 = reduce(lambda x, y: x + y, [[wn.synsets(x)[0]] for x in line2 if ((x not in sw) and wn.synsets(x))])

    runningscore = 0.0
    runningcount = 0
    # get Wordnet similarity score for <metric> for each pair created from both synset lists
    for synset1 in set(synset_list1):
        for synset2 in set(synset_list2):
            if ic is not None:
                try:
                    mark = metric(synset1, synset2, ic)
                    if mark is None:
                        mark = 0.0
                except:
                    mark = 0.0
                # handle the infinity mark returned by the jcn measure
                if mark == 1e300:
                    mark = 1.0
                runningscore += mark
            else:
                try:
                    mark = metric(synset1, synset2)
                    if mark is None:
                        mark = 0.0
                except:
                    mark = 0.0
                runningscore += mark
            runningcount += 1

    # add up individual scores, divide by number of individual scores
    sim = runningscore / runningcount
    return sim
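
# # Test (a hedged sketch in the style of the other sim_* test blocks below;
# # load_sentences and the two data files are assumed to exist as they do there)
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')
# score = sim_sem_firstsense_pos(list1[0], list2[1])
# print 'jcn (first sense): ', score
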
Example #48
0
import nltk
import sklearn
from nltk import word_tokenize as wt

# --------- optional start ------------ #
f = open('NEWS.txt', encoding='utf-8')
raw = f.read()

# WORD TOKENIZER
tokens = wt(raw)

# CREATE NLTK TEXT
txt = nltk.Text(tokens)
# ----------optional end -------------- #

# CLASSIFIER
from nltk.corpus import names


# SINGLE FEATURE GENERATOR & CLASSIFICATION #
# ------------------- start --------------- #
def feature_generator(word):
    return {'last_letter': word[-1]}


print(feature_generator('praveen'))

import random

names = ([(name, 'male') for name in names.words('male.txt')] + \
   [(name, 'female') for name in names.words('female.txt')])
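
# The snippet above stops right after building the labelled name list. Below is
# a minimal sketch of the usual continuation (shuffle, featurize, train/test
# split, Naive Bayes); the 500-name test split is an illustrative assumption.
random.shuffle(names)
featuresets = [(feature_generator(n), gender) for (n, gender) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(feature_generator('praveen')))
print(nltk.classify.accuracy(classifier, test_set))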
Example #49
0
def preprocess_marco(input_file="../data/dev_v1.1.json.gz",
                     vocab_file="../data/vocab_marco.json",
                     data_file="../data/marco_dev.txt",
                     max_sent_len=20,
                     max_doc_len=100,
                     vocab_size=10000):
    lines = gzip.open(input_file, 'r').readlines()
    vocab_writer = codecs.open(vocab_file, 'w', "utf-8")
    data_writer = codecs.open(data_file, "w", "utf-8")
    vocab = {}
    for line in lines:
        content = json.loads(line.decode("utf-8"))

        query = content["query"]
        passages = content["passages"]
        answers = content["answers"]

        query_token = wt(query.lower())
        query_token = list(map(lambda t: t.lower(), query_token))
        for token in query_token:
            if token not in vocab:
                vocab[token] = 0
            vocab[token] += 1
        if len(answers) == 1:
            data_writer.write(' '.join(query_token[:max_sent_len]) + '\t')

        for passage in passages:
            tokens = wt(passage["passage_text"].lower())

            for token in tokens:
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1
            if passage["is_selected"] and len(answers) == 1:
                data_writer.write(' '.join(tokens[:max_doc_len]) + '\t')

        for answer in answers:
            answer_token = wt(answer.lower())

            for token in answer_token:
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1
            if len(answers) == 1:
                data_writer.write(' '.join(answer_token[:max_sent_len]) + '\n')

    sorted_count = sorted(vocab.items(), key=lambda t: t[1], reverse=True)
    sorted_count = list(map(lambda t: t[0], sorted_count[:vocab_size - 1]))
    json_out = {
        "index_to_token": {i + 1: t
                           for i, t in enumerate(sorted_count)},
        "token_to_index": {t: i + 1
                           for i, t in enumerate(sorted_count)}
    }

    json_out["index_to_token"][vocab_size] = "UNK"
    json_out["token_to_index"]["UNK"] = vocab_size

    json.dump(json_out, vocab_writer)
    vocab_writer.flush()
    data_writer.flush()
    vocab_writer.close()
    data_writer.close()
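
# Hedged usage sketch: the default paths are the ones already named in the
# signature, and gzip/json/codecs are assumed to be imported in the full module.
# preprocess_marco(input_file="../data/dev_v1.1.json.gz",
#                  vocab_file="../data/vocab_marco.json",
#                  data_file="../data/marco_dev.txt")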
Example #50
0
def ExtractFeature1(processedfile,
                    wordlist1=False,
                    wordlist2=False,
                    wordlist3=False):
    """
    processedfile : csvfile, (output of 'PreprocessCSV')
    wordlist1/2/3 : list
    """
    features = []
    dataframe = pandas.read_csv(processedfile, usecols=["Insult", "Comment"])
    labels = dataframe.iloc[:, 0].tolist()
    sents = dataframe.iloc[:, 1].tolist()

    # unigram feature
    for sent in sents:
        #every sent generates a feature vector
        words = wt(sent)
        sent_fea = []
        #first part feature
        if wordlist1:
            cur_fea1 = [0] * len(wordlist1)
            for word in words:
                if word in wordlist1:
                    fea_ind = wordlist1.index(word)
                    cur_fea1[fea_ind] += 1
                else:
                    pass
            sent_fea += cur_fea1
            cur_fea2 = [0]
            for word in words:
                if word in BW:
                    cur_fea2[0] += 1
                    if word.isupper():
                        cur_fea2[0] += 1
            sent_fea += cur_fea2
            cur_fea3 = [0]
            if cur_fea2[0] == 1:
                if "you" in sent:
                    cur_fea3[0] += 1
            sent_fea += cur_fea3
        # bigram feature
        if wordlist2:
            cur_bgram = [0] * len(wordlist2)
            # walk over adjacent word pairs once; the original extra
            # "for word in words" loop counted every bigram len(words) times
            if len(words) >= 2:
                for i in range(len(words) - 1):
                    bword = words[i] + words[i + 1]
                    if bword in wordlist2:
                        fea_ind = wordlist2.index(bword)
                        cur_bgram[fea_ind] += 1
            sent_fea += cur_bgram
        # trigram feature
        if wordlist3:
            cur_trigram = [0] * len(wordlist3)
            # same fix as above, applied to adjacent word triples
            if len(words) >= 3:
                for i in range(len(words) - 2):
                    tword = words[i] + words[i + 1] + words[i + 2]
                    if tword in wordlist3:
                        fea_ind = wordlist3.index(tword)
                        cur_trigram[fea_ind] += 1
            sent_fea += cur_trigram
        features.append(sent_fea)
    print("labels and features are extracted from file %s." % processedfile)
    return labels, features
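
# Hedged usage sketch: the CSV name and the word lists below are illustrative
# assumptions; ExtractFeature1 expects the output of 'PreprocessCSV'.
# labels, features = ExtractFeature1('train_processed.csv',
#                                    wordlist1=['idiot', 'stupid', 'loser'],
#                                    wordlist2=['youare'],
#                                    wordlist3=['youareidiot'])
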
def sim_sem_intermax(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic):

    # Bug fix: lower
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # import stopwords 
    sw = stopwords.words('english')
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])                                         
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    # POS 
    pos_line1 = pos_tag(line1)
    pos_line2 = pos_tag(line2)

    # filter line1 and line2 using POS info
    # only remain verbs, nouns, adverbs, adjectives
    filtered_line1 = []
    filtered_line2 = []
    for tagged_tuple in pos_line1:
        term = tagged_tuple[0]
        tag  = tagged_tuple[1]
        # find out all verbs, nouns, adverbs, adjectives
        # in the meanwhile get rid of terms that do not appear in WordNet
        if (tag.startswith('V') or tag.startswith('N') or tag.startswith('R') or tag.startswith('J')) and wn.synsets(term):
            filtered_line1.append(term)
    for tagged_tuple in pos_line2:
        term = tagged_tuple[0]
        tag  = tagged_tuple[1]
        # find out all verbs, nouns, adverbs, adjectives
        # in the meanwhile get rid of terms that do not appear in WordNet
        if (tag.startswith('V') or tag.startswith('N') or tag.startswith('R') or tag.startswith('J')) and wn.synsets(term):
            filtered_line2.append(term)

    # get the synset list for each sentence, containing all WordNet senses
    # without stopword elimination
    synset_list1 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in filtered_line1])
    synset_list2 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in filtered_line2])
    # # # with stopword elimination
    # # synset_list1 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in filtered_line1 if x not in sw])
    # # synset_list2 = reduce(lambda x,y:x+y,[wn.synsets(x) for x in filtered_line2 if x not in sw])

    # get max score lists using the inter max function defined above
    max_score_list1 = inter_sentence_max(filtered_line1, synset_list2, metric=metric, ic=ic)
    max_score_list2 = inter_sentence_max(filtered_line2, synset_list1, metric=metric, ic=ic)

    sim = (sum(max_score_list1) + sum(max_score_list2)) / (len(max_score_list1) + len(max_score_list2))
    return sim


# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[0]
# sentence2 = list2[1]

# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# # sim_sem_intermax(sentence1, sentence2)

# score = sim_sem_intermax(sentence1, sentence2)
# print 'path: ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown  : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# Sample results:
# sentence1 = list1[0]
# sentence2 = list2[1]
# path:  0.511742424242
# lch :  2.37924751823
# wup :  0.715648844878
# res - brown  :  5.9252699315
# res - semcor :  6.82379313536
# jcn :  0.693656881745
# lin :  0.662626674403


def sim_wordorder(sentence1, sentence2, threshold=0.3, metric=wn.path_similarity, ic=None):
    # lowercase
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    
    # joint list
    # # Note: building J with set() does not preserve word order; however, the
    # #       word order vector calculation does not strictly require an
    # #       order-preserving J, since the vector modulus ignores ordering
    # J = list(set(line1).union(set(line2)))
    # print J
    # an order-preserving way to build the joint word list:
    combined = line1 + line2
    J = []
    for x in combined:
        if x not in J:
            J.append(x)

    r1 = calculate_word_order_vector(J, line1, threshold, metric, ic)
    r2 = calculate_word_order_vector(J, line2, threshold, metric, ic)

    # Similarity calculation given word order vector r1 and r2
    # transfer to array
    x = np.array(r1)
    y = np.array(r2)
    # difference and sum
    diff = x - y
    summ = x + y
    # modulus
    diff_modulus = np.sqrt((diff*diff).sum())
    summ_modulus = np.sqrt((summ*summ).sum())
    # final similarity
    sim = 1 - (diff_modulus/summ_modulus)

    return sim


# # Test
# # T1 = 'A quick brown dog jumps over the lazy fox.'
# # T2 = 'A quick blue fox jumps over the lazy dog.'
# # score = sim_wordorder(T1,T2)
# # print score

# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')

# sentence1 = list1[0]
# sentence2 = list2[1]

# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# # sim_wordorder(sentence1, sentence2)

# score = sim_wordorder(sentence1, sentence2)
# print 'path: ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown  : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# # Sample results:
# # sentence1 = list1[0]
# # sentence2 = list2[1]
# # path:  0.19306769657
# # lch :  0.250567652287
# # wup :  0.256855951338
# # res - brown  :  0.252420961067
# # res - semcor :  0.252420961067
# # jcn :  0.180138258853
# # lin :  0.310539825618
text2 = "I am reading that privacy policy"

texts = ["We do not rent, sell, or share any of this information with third party companies.", 
        "We do not rent, sell, or share any information about the user with any third-parties. ",
        "We do not, under any circumstances, share, sell or rent your information to anyone. ",
        "We never share or sell your personal information", "We neither rent nor sell your Personal Information to anyone", 
        "As a general rule, Blizzard will not forward your information to a third party without your permission."]
    
# print extractor(text)
# tokens = wt(text)
# print tokens
# pos_line = pos_tag(tokens)
# print pos_line
# tagged = pos_tagging([text])
# print tagged
print wt(text)
print pos_tag(wt(text))
print stemming([text])
print pos_lemmatizing([text])
print select_by_pos([text])
print negation_bigram([text])
print term_extraction([text])
print pos_bagging([text])
print pos_tagging([text])
print sem_firstsense([text])
print sem_wsd_sentence([text])

# stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()

# # terms = ["best", "better", "goods"]


def negation_bigram(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    POS tag the line, match patterns of negations to form bigram terms

    Return: neg_bigram_list (list of strings(terms that meets the POS criteria))
    """
    neg_verb_set = ['not', 'never', 'neither']
    neg_noun_set = ['without']
    verb_window = 10
    noun_window = 3

    neg_bigram_list = []
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        
        # Having punctuation removal before POS seems to be a bad idea
        # # remove punctuation
        # # below method will simply remove punctuation, but mistakes such as amazon.com => amazoncom
        # # line = ''.join([c for c in line 
        #                                     # if re.match("[a-z\-\' \n\t]", c)])
        # # this solves the problem above:
        # line = re.sub('[^A-Za-z0-9]+', ' ', line)                                            
        
        # tokenize
        line_token = wt(line)
        # base for return
        neg_bigram_line = line_token
        # POS 
        pos_line = pos_tag(line_token)
        
        # first iteration to find flag words
        neg_verb = None
        neg_verb_flag = None
        neg_noun_flag = None
        for i, tagged_tuple in enumerate(pos_line):
            term = tagged_tuple[0]
            if term in neg_verb_set:
                neg_verb_flag = i
                neg_verb = term
            elif term in neg_noun_set:
                neg_noun_flag = i

        # second iteration to find neg_verb match and form bigram
        if neg_verb_flag is not None:
            for i, tagged_tuple in enumerate(pos_line):
                term = tagged_tuple[0]
                tag  = tagged_tuple[1]
                if (i-neg_verb_flag)<=verb_window and (i-neg_verb_flag)>0 and tag.startswith('V'):
                    neg_bigram_line.append(neg_verb+term)

        # third iteration to find neg_noun match and form bigram
        if neg_noun_flag is not None:
            for i, tagged_tuple in enumerate(pos_line):
                term = tagged_tuple[0]
                tag  = tagged_tuple[1]
                if (i-neg_noun_flag)<=noun_window and (i-neg_noun_flag)>0 and tag.startswith('N'):
                    neg_bigram_line.append("without"+term)
        
        # back to sentence as a string
        neg_bigram_sentence = ' '.join(neg_bigram_line)
        neg_bigram_list.append(neg_bigram_sentence)
    return neg_bigram_list
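

# # Test (a hedged sketch; the sentence below is illustrative, not from the source)
# print negation_bigram(["We will not sell or share your data without your consent."])
# # in addition to the original tokens, bigrams such as "notsell", "notshare"
# # and "withoutconsent" should be appended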