Example #1
def bag_of_words(string,
                 pos_tagging=False,
                 stopwords=False,
                 len_gt=False,
                 lemmatisation=False,
                 stemming=False):  #returns a bag-of-words dict and total word count for the given string
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-word characters
    words = token(string)  #tokenise string
    if pos_tagging:  #consider only nouns, adjectives
        words = [i for i in words if pos([i])[0][1] in GOOD_POS]
    if stopwords:
        words = [
            i for i in words
            if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)
        ]
    if len_gt:
        words = [i for i in words if len(i) > 2]
    if lemmatisation:
        words = [LEMMATE.lemmatize(i) for i in words]
    if stemming:
        words = [SNOW.stem(i) for i in words]
    bag = {}
    for i in words:
        if i in bag:
            bag[i] += 1
        else:
            bag[i] = 1
    return bag, len(words)
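
A minimal, self-contained sketch of the same counting step using collections.Counter (the project's token, LEMMATE and SNOW helpers are left out; the sample text is purely illustrative):

import re
from collections import Counter

text = 'Gradient descent, gradient descent!'
tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()  #lowercase, strip punctuation, split on whitespace
bag = Counter(tokens)
print(dict(bag), sum(bag.values()))  #{'gradient': 2, 'descent': 2} 4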
Example #2
def co_occurrence_analysis(
    string,
    concept_words,
    concept_lengths,
    concept_indices,
    word_relations={},
    window_size=100
):  #find related words, not being used in the final implementation?
    print('Co-occurrence analysis to find related concepts...')
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    new_words = []
    i = 0
    while i < len(words):
        found = False
        for j in range(len(concept_words)):
            if i in concept_indices[j]:
                new_words += [concept_words[j]]
                i += concept_lengths[j]
                found = True
                break
        if not found:
            new_words += [words[i]]
            i += 1
    words = [
        i for i in new_words
        if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)
    ]  #remove stop words (english and nptel) from the concept-merged tokens
    words = [i for i in words if len(i) > 2]  #remove short words
    #words = [LEMMATE.lemmatize(i) for i in words] #lemmatisation
    #words = [SNOW.stem(i) for i in words] #stemming
    check_done = set()
    for i in range(len(words)):
        for j in range(max(0, i - window_size), min(i + window_size,
                                                    len(words))):
            if words[i] == words[j] or words[i] not in concept_words or words[
                    j] not in concept_words:
                continue
            if words[i] in word_relations:
                if words[j] in word_relations[words[i]]:
                    if (words[i], words[j]) in check_done:
                        word_relations[words[i]][words[j]][-1] += 1
                    else:
                        word_relations[words[i]][words[j]] += [1]
                        check_done.add((words[i], words[j]))
                else:
                    word_relations[words[i]][words[j]] = [1]
            else:
                word_relations[words[i]] = {words[j]: [1]}
            check_done.add((words[i], words[j]))
    return word_relations
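
Because check_done is re-created on every call, each invocation appends one new count per co-occurring pair, so after two calls (say, two transcripts) the returned structure looks roughly like this (illustrative values, not real output):

word_relations = {
    'gradient': {'descent': [5, 2]},  #5 co-occurrences in the first call, 2 in the second
    'descent': {'gradient': [5, 2]},
}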
Example #3
def get_text_rank(
        string,
        k=8,
        n_occurr=2,
        d=0.85,
        thresh=0.001):  #returns top k (keyword, score) pairs based on TextRank
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    words = [i for i in words
             if pos([i])[0][1] in GOOD_POS]  #part of speech filtering
    words = [
        i for i in words if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)
    ]  #remove stop words (english and nptel)
    words = [LEMMATE.lemmatize(i) for i in words]  #lemmatisation
    words = [SNOW.stem(i) for i in words]  #stemming
    unique = {}
    reverse = {}
    num_words = 0
    for i in words:
        if i not in unique:
            unique[i] = num_words
            reverse[num_words] = i
            num_words += 1
    print('Running text rank on', num_words, 'unique words...')
    graph = [[0 for i in range(num_words)] for i in range(num_words)]
    #Form co-occurrence matrix
    for i in range(len(words)):
        for j in range(max(0, i - n_occurr), min(i + n_occurr, len(words))):
            graph[unique[words[i]]][unique[words[j]]] = 1
    #Iterate and perform text rank
    err = float('inf')
    score = [1 for i in range(num_words)]
    links = [sum(x) for x in graph]
    while err > thresh:
        new_score = [(1 - d) + d * sum([
            score[j] / links[j] for j in range(num_words) if graph[i][j] == 1
        ]) for i in range(num_words)]  #Page rank inspired formula
        err = sum([abs(new_score[i] - score[i])
                   for i in range(num_words)]) / num_words
        score = new_score[:]
    keywords = [(reverse[i], score[i]) for i in range(num_words)]
    keywords = sorted(keywords, key=lambda x: x[1], reverse=True)
    if (k > 0):
        keywords = keywords[:k]
    return keywords
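
The while loop above is the damped, PageRank-style update: each word's score is (1 - d) plus d times the sum of its neighbours' scores divided by their link counts, iterated until the mean absolute change drops below thresh. A self-contained toy run of the same update on a 3-node co-occurrence graph (illustrative data, not the project's):

d, thresh = 0.85, 0.001
graph = [[0, 1, 1], [1, 0, 0], [1, 0, 0]]  #node 0 co-occurs with both other nodes
n = len(graph)
links = [sum(row) for row in graph]
score = [1.0] * n
err = float('inf')
while err > thresh:
    new_score = [(1 - d) + d * sum(score[j] / links[j]
                                   for j in range(n) if graph[i][j] == 1)
                 for i in range(n)]
    err = sum(abs(new_score[i] - score[i]) for i in range(n)) / n
    score = new_score
print(score)  #the hub node 0 ends with the highest score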
Example #4
def find_stop_words(
        directory,
        tfidf_cutoff=0.0005):  #finds stopwords from a corpus of documents
    transcripts = listdir(directory)
    print('Finding stop words from', len(transcripts), 'documents:')
    doc_freq = {}
    all_vocab = []
    num_terms = []
    for doc in transcripts:
        print(doc)
        fd = open(directory + '/' + doc)
        text = fd.read().lower()
        fd.close()
        text = re.sub(r'[^\w\s]', ' ', text)  #remove non-words
        words = token(text)
        words = [LEMMATE.lemmatize(i) for i in words]  #lemmatisation
        words = [SNOW.stem(i) for i in words]  #stemming
        num_terms.append(len(words))
        vocab = {}
        for i in words:
            if i in vocab:
                vocab[i] += 1
            else:
                vocab[i] = 1
        for i in vocab.keys():
            if i in doc_freq:
                doc_freq[i] += 1
            else:
                doc_freq[i] = 1
        all_vocab.append(vocab)
    stop_words = []
    for term in doc_freq.keys():
        idf = log(len(transcripts) / doc_freq[term])
        tf_idf = [(float(all_vocab[i][term]) / num_terms[i]) *
                  idf if term in all_vocab[i] else 0
                  for i in range(len(transcripts))]
        print(tf_idf)
        cutoff = [tf_idf[i] > tfidf_cutoff for i in range(len(tf_idf))]
        if (any(cutoff)):
            stop_words.append(term)
    print(stop_words)
    return stop_words
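
For reference, the weighting computed above is plain tf-idf: tf is the term count divided by the document length, and idf is log(N / document frequency). A self-contained toy computation (illustrative two-document corpus):

from math import log

docs = [['data', 'mining', 'data'], ['data', 'bases']]
doc_freq = {'data': 2, 'mining': 1, 'bases': 1}
for term in doc_freq:
    idf = log(len(docs) / doc_freq[term])
    tf_idf = [doc.count(term) / len(doc) * idf for doc in docs]
    print(term, [round(v, 3) for v in tf_idf])
#'data' occurs in every document, so idf = log(2/2) = 0 and its tf-idf is 0 everywhere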
Example #5
def candidate_concept_phrases(
    string,
    thresh=5
):  #given a string, does POS pattern matching and uses threshold occurrence heuristics to output a candidate set of concept phrases
    print('Finding candidate concept phrases...')
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    tree = CHUNK.parse(pos(words))
    list_concepts = []
    set_concepts = []
    for a in tree:
        if str(type(a)) == "<class 'nltk.tree.Tree'>":
            if a.label() == 'CONCEPT':
                concept = []
                for i in range(len(a)):
                    concept += [a[i][0]]
                c = check_occurs(concept, words, thresh)
                if c[0] and (concept not in set_concepts):
                    list_concepts.append((concept, c[1]))
                    set_concepts.append(concept)
    return list_concepts
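
CHUNK, pos and check_occurs are project helpers that are not shown in these examples. Purely as an assumption about what such a chunker could look like, a comparable NLTK setup might be (the project's actual grammar may differ):

import nltk

pos = nltk.pos_tag  #assumed stand-in for the project's pos() helper
CHUNK = nltk.RegexpParser('CONCEPT: {<JJ>*<NN.*>+}')  #optional adjectives followed by one or more nouns

tree = CHUNK.parse(pos(['stochastic', 'gradient', 'descent', 'converges', 'slowly']))
for subtree in tree.subtrees(lambda t: t.label() == 'CONCEPT'):
    print([word for word, tag in subtree.leaves()])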
Example #6
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import string, numpy as np

#token(), used below, is the project's tokeniser helper, defined elsewhere


def line():
    print('-' * 50)


#IMPORT DATASET
#-------------------------------------------------------------------------------------
print('1. IMPORT DATASET')
line()
DATA_RAW = []
for j in range(0, 717):
    f = open(str(j + 1) + '.txt', 'r').read()
    DATA_RAW.append(f.replace('\n', ' '))

concat = np.concatenate([token(i) for i in DATA_RAW])
N = len(DATA_RAW)
V_RAW = len(set(concat))

print('NUMBER OF DOCUMENTS:', N)
print('NUMBER OF FEATURE  :', V_RAW, '\n')

#PRE-PROCESSING
#-------------------------------------------------------------------------------------
print('2. PREPROCESSING')
line()

swords = stopwords.words('english') + list(string.punctuation)
stemmer = SnowballStemmer('english')  #overridden below by the Sastrawi stemmer
junk = tuple(string.punctuation) + tuple(str(k) for k in range(10)) + tuple('¿')

#Sastrawi (Indonesian) stemmer and stop-word list
ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

#raw documents
print('DATA_RAW')
print(DATA_RAW)

doc = []
for i in DATA_RAW:
    temp = []
    for j in token(i):
        word = stemmer.stem(j.lower())
        #if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation)+tuple([str(k) for k in range(10)])+tuple('¿')):
        temp.append(word)
    doc.append(temp)

dictionary = []
for i in doc:
    for j in i:
        if j not in dictionary:
            dictionary.append(j)

#dictionary
print('dictionary')
print(dictionary)
Example #8
def segment_transcript(
        all_string,
        stringency=0.5,
        len_wt=0.8,
        len_mu=8,
        len_sigma=2):  #dynamic programming method to segment transcripts
    all_string = re.sub(r'\n+', '', all_string)
    sections = re.split(r'[\s]*\([\s]*refer.*?\)[\s]*',
                        all_string,
                        flags=re.IGNORECASE)
    segmented = ''
    for string in sections:
        try:
            sentences = sents(string)
            if len(sentences) < 3:
                #too-short sections are passed through and merged with what follows
                raise Exception
            string = string.lower()
            string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
            words = token(string)  #tokenise string
            words = [
                i for i in words
                if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)
            ]  #remove stop words
            words = list(set([i for i in words
                              if len(i) > 2]))  #drop short words and duplicates
            wrd_matrix = [[0 for i in range(len(words))]
                          for j in range(len(sentences))]
            for i in range(len(sentences)):
                for j in range(len(words)):
                    if words[j] in sentences[i]:
                        wrd_matrix[i][j] = 1
            sim_matrix = [[0 for i in range(len(sentences))]
                          for j in range(len(sentences))]
            for wrd in wrd_matrix:
                occurs_in = [i for i in range(len(sentences)) if wrd[i] == 1]
                for i in occurs_in:
                    for j in occurs_in:
                        sim_matrix[i][j] = 1
                        sim_matrix[j][i] = 1
            #init
            density_matrix = [[0.0]] + [[0.0] + [
                sum([sum(sim_matrix[k][j:i + 1])
                     for k in range(j, i + 1)]) / (i - j + 1)**stringency
                for j in range(i + 1)
            ] for i in range(len(sentences))]
            cost_matrix = [float('inf') for i in range(len(sentences) + 1)]
            cost_matrix[0] = 0
            asgn_matrix = [0 for i in range(len(sentences) + 1)]
            #minimisation
            for i in range(1, len(sentences) + 1):
                for j in range(i):
                    new_cost = cost_matrix[j] - (
                        1 - len_wt) * density_matrix[i][j] + len_wt * (
                            (i - j - len_mu)**2 / (2 * len_sigma**2))
                    if (new_cost < cost_matrix[i]):
                        cost_matrix[i] = new_cost
                        asgn_matrix[i] = j
            #backtracking
            brkpts = [len(sentences)]
            while (asgn_matrix[brkpts[0]] > 0):
                brkpts = [asgn_matrix[brkpts[0]]] + brkpts
            brkpts = [i - 1 for i in brkpts]
            prev = 0
            for i in brkpts:
                for j in range(prev, i):
                    segmented += sentences[j] + ' '
                segmented += sentences[i] + '\n\n'
                prev = i + 1
        except Exception:  #short or unparseable sections are appended unsegmented
            segmented += string
    return segmented
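
The minimisation step above fills cost_matrix with the recurrence

$$\mathrm{cost}(i) = \min_{0 \le j < i}\Big[\mathrm{cost}(j) - (1-\lambda)\,\mathrm{density}(j,i) + \lambda\,\frac{(i-j-\mu)^2}{2\sigma^2}\Big]$$

where lambda = len_wt, mu = len_mu, sigma = len_sigma, and density(j, i) is the summed intra-segment sentence similarity divided by the segment length raised to stringency. asgn_matrix stores the minimising j for each i, and the backtracking pass converts those pointers into sentence breakpoints.
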
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string, math, numpy as np, pandas as pd

def line():
    print('-' * 50)

#IMPORT DATASET
#-------------------------------------------------------------------------------------
print('1. IMPORT DATASET'); line()
rawdata = []
for j in range(0,717):
    x = open(str(j+1)+'.txt','r').read()
    rawdata.append(x.replace('\n',' '))

dtoken = [token(i) for i in rawdata]
concat = np.concatenate(dtoken)

print('NUMBER OF DOCUMENTS:',len(rawdata))
print('NUMBER OF FEATURE  :',len(set(concat)),'\n')

#PRE-PROCESSING
#-------------------------------------------------------------------------------------
print('2. PREPROCESSING'); line()

stop_word = stopwords.words('english')+list(string.punctuation)
stemmer = SnowballStemmer('english')
junk = tuple(string.punctuation)+tuple([str(k) for k in range(10)])+tuple('¿')

doc=[]
for i in rawdata: