def bag_of_words(string, pos_tagging=False, stopwords=False, len_gt=False,
                 lemmatisation=False, stemming=False):
    #outputs a BoW vector for the given string
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    if pos_tagging:  #consider only nouns, adjectives
        words = [i for i in words if pos([i])[0][1] in GOOD_POS]
    if stopwords:
        words = [i for i in words
                 if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)]
    if len_gt:
        words = [i for i in words if len(i) > 2]
    if lemmatisation:
        words = [LEMMATE.lemmatize(i) for i in words]
    if stemming:
        words = [SNOW.stem(i) for i in words]
    bag = {}
    for i in words:
        if i in bag:
            bag[i] += 1
        else:
            bag[i] = 1
    return bag, len(words)

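# Minimal usage sketch for bag_of_words (assumes the module-level helpers used
# above -- re, token, pos, GOOD_POS, STOP_WORDS, NPTEL_STOP_WORDS, LEMMATE and
# SNOW -- are defined elsewhere in the module, e.g. LEMMATE = WordNetLemmatizer()
# and SNOW = SnowballStemmer('english')). `bag` maps each surviving token to its
# count and `n` is the number of tokens kept after filtering. The demo name and
# sample text are hypothetical.
def _demo_bag_of_words():
    bag, n = bag_of_words('Neural networks learn distributed representations.',
                          stopwords=True, len_gt=True, lemmatisation=True)
    print(bag, n)
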
def co_occurrence_analysis(string, concept_words, concept_lengths,
                           concept_indices, word_relations=None,
                           window_size=100):
    #find related words, not being used in the final implementation?
    if word_relations is None:  #avoid a shared mutable default argument
        word_relations = {}
    print('Co-occurrence analysis to find related concepts...')
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    new_words = []
    i = 0
    while i < len(words):  #merge multi-word concepts into single tokens
        found = False
        for j in range(len(concept_words)):
            if i in concept_indices[j]:
                new_words += [concept_words[j]]
                i += concept_lengths[j]
                found = True
                break
        if not found:
            new_words += [words[i]]
            i += 1
    words = new_words  #continue with the concept-merged token stream
    words = [i for i in words
             if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)]  #remove stop words (english and nptel)
    words = [i for i in words if len(i) > 2]  #remove short words
    #words = [LEMMATE.lemmatize(i) for i in words]  #lemmatisation
    #words = [SNOW.stem(i) for i in words]  #stemming
    check_done = set()
    for i in range(len(words)):
        for j in range(max(0, i - window_size),
                       min(i + window_size, len(words))):
            if (words[i] == words[j] or words[i] not in concept_words
                    or words[j] not in concept_words):
                continue
            if words[i] in word_relations:
                if words[j] in word_relations[words[i]]:
                    if (words[i], words[j]) in check_done:
                        word_relations[words[i]][words[j]][-1] += 1
                    else:
                        word_relations[words[i]][words[j]] += [1]
                        check_done.add((words[i], words[j]))
                else:
                    word_relations[words[i]][words[j]] = [1]
                    check_done.add((words[i], words[j]))
            else:
                word_relations[words[i]] = {words[j]: [1]}
                check_done.add((words[i], words[j]))
    return word_relations

def get_text_rank(string, k=8, n_occurr=2, d=0.85, thresh=0.001):
    #returns top k (keyword, score) pairs based on TextRank
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    words = [i for i in words if pos([i])[0][1] in GOOD_POS]  #part of speech filtering
    words = [i for i in words
             if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)]  #remove stop words (english and nptel)
    words = [LEMMATE.lemmatize(i) for i in words]  #lemmatisation
    words = [SNOW.stem(i) for i in words]  #stemming
    unique = {}
    reverse = {}
    num_words = 0
    for i in words:
        if i not in unique:
            unique[i] = num_words
            reverse[num_words] = i
            num_words += 1
    print('Running text rank on', num_words, 'unique words...')
    graph = [[0 for i in range(num_words)] for i in range(num_words)]
    #Form co-occurrence matrix
    for i in range(len(words)):
        for j in range(max(0, i - n_occurr), min(i + n_occurr, len(words))):
            graph[unique[words[i]]][unique[words[j]]] = 1
    #Iterate and perform text rank
    err = float('inf')
    score = [1.0 for i in range(num_words)]
    links = [sum(x) for x in graph]
    while err > thresh:
        new_score = [(1 - d) + d * sum([score[j] / links[j]
                                        for j in range(num_words)
                                        if graph[i][j] == 1])
                     for i in range(num_words)]  #Page rank inspired formula
        err = sum([abs(new_score[i] - score[i])
                   for i in range(num_words)]) / num_words
        score = new_score[:]
    keywords = [(reverse[i], score[i]) for i in range(num_words)]
    keywords = sorted(keywords, key=lambda x: x[1], reverse=True)
    if (k > 0):
        keywords = keywords[:k]
    return keywords

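# Minimal usage sketch for get_text_rank (same module-level helper assumptions
# as above; the file name and demo name are hypothetical). Each returned pair
# is (stemmed keyword, score), where scores follow the PageRank-style update
# S(w_i) = (1 - d) + d * sum_j S(w_j) / L(w_j) over words w_j co-occurring
# with w_i inside the n_occurr window, iterated until the mean absolute change
# drops below thresh.
def _demo_get_text_rank():
    transcript = open('lecture_01.txt').read()
    for keyword, score in get_text_rank(transcript, k=8):
        print(keyword, round(score, 3))
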
def find_stop_words(directory, tfidf_cutoff=0.0005):
    #finds stopwords from a corpus of documents
    transcripts = listdir(directory)
    print('Finding stop words from', len(transcripts), 'documents:')
    doc_freq = {}
    all_vocab = []
    num_terms = []
    for doc in transcripts:
        print(doc)
        fd = open(directory + '/' + doc)
        text = fd.read().lower()
        fd.close()
        text = re.sub(r'[^\w\s]', ' ', text)  #remove non-words
        words = token(text)
        words = [LEMMATE.lemmatize(i) for i in words]  #lemmatisation
        words = [SNOW.stem(i) for i in words]  #stemming
        num_terms.append(len(words))
        vocab = {}
        for i in words:
            if i in vocab:
                vocab[i] += 1
            else:
                vocab[i] = 1
        for i in vocab.keys():
            if i in doc_freq:
                doc_freq[i] += 1
            else:
                doc_freq[i] = 1
        all_vocab.append(vocab)
    stop_words = []
    for term in doc_freq.keys():
        idf = log(float(len(transcripts)) / doc_freq[term])  #inverse document frequency
        tf_idf = [(float(all_vocab[i][term]) / num_terms[i]) * idf
                  if term in all_vocab[i] else 0
                  for i in range(len(transcripts))]
        print(tf_idf)
        cutoff = [tf_idf[i] > tfidf_cutoff for i in range(len(tf_idf))]
        if (any(cutoff)):
            stop_words.append(term)
    print(stop_words)

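# Minimal usage sketch for find_stop_words (assumes listdir is os.listdir and
# log is math.log, imported elsewhere in the module; the directory name and
# demo name are hypothetical). For each term, tf-idf in document i is
# (count(term, i) / num_terms[i]) * log(N / doc_freq[term]); a term whose
# tf-idf exceeds tfidf_cutoff in at least one document is collected and printed.
def _demo_find_stop_words():
    find_stop_words('transcripts', tfidf_cutoff=0.0005)
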
def candidate_concept_phrases(string, thresh=5):
    #given a string, does POS pattern matching and uses threshold occurrence
    #heuristics to output a candidate set of concept phrases
    print('Finding candidate concept phrases...')
    string = string.lower()
    string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
    words = token(string)  #tokenise string
    tree = CHUNK.parse(pos(words))
    list_concepts = []
    set_concepts = []
    for a in tree:
        if str(type(a)) == "<class 'nltk.tree.Tree'>":
            if a.label() == 'CONCEPT':
                concept = []
                for i in range(len(a)):
                    concept += [a[i][0]]
                c = check_occurs(concept, words, thresh)
                if c[0] and (concept not in set_concepts):
                    list_concepts.append((concept, c[1]))
                    set_concepts.append(concept)
    return list_concepts

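# CHUNK and check_occurs are defined elsewhere in the module; the sketch below
# is only an assumption about what they might look like (the real grammar and
# occurrence test may differ). A CONCEPT chunk would match an adjective* noun+
# pattern, and check_occurs would accept a phrase appearing at least `thresh`
# times in the token stream. The _example_* names are hypothetical and exist
# only for illustration.
import nltk

_EXAMPLE_CHUNK = nltk.RegexpParser('CONCEPT: {<JJ>*<NN.*>+}')


def _example_check_occurs(concept, words, thresh):
    #count contiguous occurrences of the phrase in the token list
    count = sum(1 for i in range(len(words) - len(concept) + 1)
                if words[i:i + len(concept)] == concept)
    return count >= thresh, count
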
def line():
    [print('-', end='') for i in range(50)]
    print('')


#IMPORT DATASET
#-------------------------------------------------------------------------------------
print('1. IMPORT DATASET')
line()
DATA_RAW = []
for j in range(0, 717):
    f = open(str(j + 1) + '.txt', 'r').read()
    DATA_RAW.append(f.replace('\n', ' '))
concat = np.concatenate([token(i) for i in DATA_RAW])
N = len(DATA_RAW)
V_RAW = len(set(concat))
print('NUMBER OF DOCUMENTS:', N)
print('NUMBER OF FEATURES :', V_RAW, '\n')

#PRE-PROCESSING
#-------------------------------------------------------------------------------------
print('2. PREPROCESSING')
line()
swords = stopwords.words('english') + list(string.punctuation)
stemmer = SnowballStemmer('english')
junk = tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import string, numpy as np

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

#rawdata
print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        #if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')):
        temp.append(word)
    doc.append(temp)

dictionary = []
for i in doc:
    for j in i:
        if j not in dictionary:
            dictionary.append(j)

#dictionary
print('dictionary')
print(dictionary)

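# A minimal follow-up sketch (an assumption, not part of the original script):
# once `doc` holds the stemmed tokens per document and `dictionary` the
# vocabulary, a term-frequency matrix can be built by counting how often each
# dictionary word appears in each document. The names word_index and tf_matrix
# are hypothetical.
word_index = {w: idx for idx, w in enumerate(dictionary)}
tf_matrix = np.zeros((len(doc), len(dictionary)), dtype=int)
for d, tokens in enumerate(doc):
    for w in tokens:
        tf_matrix[d, word_index[w]] += 1
print('TF MATRIX SHAPE:', tf_matrix.shape)
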
def segment_transcript(all_string, stringency=0.5, len_wt=0.8, len_mu=8,
                       len_sigma=2):
    #dynamic programming method to segment transcripts
    all_string = re.sub(r'\n+', '', all_string)
    sections = re.split(r'[\s]*\([\s]*refer.*?\)[\s]*', all_string,
                        flags=re.IGNORECASE)
    segmented = ''
    for string in sections:
        try:
            sentences = sents(string)
            if (len(sentences) < 3):
                #too short sections are clubbed with the following paragraph
                raise Exception
            string = string.lower()
            string = re.sub(r'[^\w\s]', ' ', string)  #remove non-words
            words = token(string)  #tokenise string
            words = [i for i in words
                     if (i not in STOP_WORDS and i not in NPTEL_STOP_WORDS)]  #remove stop words
            words = list(set([i for i in words if len(i) > 2]))  #remove trailing words
            #wrd_matrix[j][i] == 1 when word j occurs in sentence i
            wrd_matrix = [[0 for i in range(len(sentences))]
                          for j in range(len(words))]
            for i in range(len(sentences)):
                for j in range(len(words)):
                    if words[j] in sentences[i]:
                        wrd_matrix[j][i] = 1
            sim_matrix = [[0 for i in range(len(sentences))]
                          for j in range(len(sentences))]
            for wrd in wrd_matrix:  #sentences sharing a word are marked similar
                occurs_in = [i for i in range(len(sentences)) if wrd[i] == 1]
                for i in occurs_in:
                    for j in occurs_in:
                        sim_matrix[i][j] = 1
                        sim_matrix[j][i] = 1
            #init
            density_matrix = [[0.0]] + [
                [0.0] + [
                    sum([sum(sim_matrix[k][j:i + 1]) for k in range(j, i + 1)])
                    / (i - j + 1)**stringency for j in range(i + 1)
                ] for i in range(len(sentences))
            ]
            cost_matrix = [float('inf') for i in range(len(sentences) + 1)]
            cost_matrix[0] = 0
            asgn_matrix = [0 for i in range(len(sentences) + 1)]
            #minimisation
            for i in range(1, len(sentences) + 1):
                for j in range(i):
                    new_cost = cost_matrix[j] \
                        - (1 - len_wt) * density_matrix[i][j] \
                        + len_wt * ((i - j - len_mu)**2 / (2.0 * len_sigma**2))
                    if (new_cost < cost_matrix[i]):
                        cost_matrix[i] = new_cost
                        asgn_matrix[i] = j
            #backtracking
            brkpts = [len(sentences)]
            while (asgn_matrix[brkpts[0]] > 0):
                brkpts = [asgn_matrix[brkpts[0]]] + brkpts
            brkpts = [i - 1 for i in brkpts]
            prev = 0
            for i in brkpts:
                for j in range(prev, i):
                    segmented += sentences[j] + ' '
                segmented += sentences[i] + '\n\n'
                prev = i + 1
        except:
            segmented += string
    return segmented

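# Minimal usage sketch for segment_transcript (same module-level helper
# assumptions as above, with sents being a sentence tokeniser such as
# nltk.sent_tokenize; the file names and demo name are hypothetical). Each
# candidate breakpoint j for a segment ending at sentence i is scored with
# cost[j] - (1 - len_wt) * density[i][j] + len_wt * (i - j - len_mu)^2 / (2 * len_sigma^2),
# so segments are rewarded for lexical cohesion and penalised for straying
# from the target length len_mu.
def _demo_segment_transcript():
    raw = open('lecture_01_raw.txt').read()
    with open('lecture_01_segmented.txt', 'w') as out:
        out.write(segment_transcript(raw, len_mu=8))
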
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string, math, numpy as np, pandas as pd


def line():
    [print('-', end='') for i in range(50)]
    print('')


#IMPORT DATASET
#-------------------------------------------------------------------------------------
print('1. IMPORT DATASET')
line()
rawdata = []
for j in range(0, 717):
    x = open(str(j + 1) + '.txt', 'r').read()
    rawdata.append(x.replace('\n', ' '))
dtoken = [token(i) for i in rawdata]
concat = np.concatenate(dtoken)
print('NUMBER OF DOCUMENTS:', len(rawdata))
print('NUMBER OF FEATURES :', len(set(concat)), '\n')

#PRE-PROCESSING
#-------------------------------------------------------------------------------------
print('2. PREPROCESSING')
line()
stop_word = stopwords.words('english') + list(string.punctuation)
stemmer = SnowballStemmer('english')
junk = tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')
doc = []
for i in rawdata: