def test_stemming():
    wordsToStem = ["tokenized", "roses", "brutally", "dozens", "stupidity",
                   "notorious", "english", "stemming", "stemmed", "soft"]
    # Stem every word in place
    for i in range(len(wordsToStem)):
        wordsToStem[i] = preprocessing.stem(wordsToStem[i])
    stemmedWords = ["token", "rose", "brutal", "dozen", "stupid",
                    "notori", "english", "stem", "stem", "soft"]
    for a, b in zip(wordsToStem, stemmedWords):
        assert a == b
def processBody(text):
    # Strip template markup like {{...}} before tokenising
    data = re.sub(r'\{\{.*\}\}', r' ', text)
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    return data
def processTitle(title):
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    return title
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            # Keep only the category name from [[category:...]]
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    return data
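# A minimal usage sketch for processCategories() (hypothetical input; the exact
# output depends on this module's tokenise/remove_stopwords/stem helpers):
#   processCategories("[[category:rock music albums]]")
#   would collect "rock music albums" and return stemmed tokens
#   along the lines of ['rock', 'music', 'album']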
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        # External links appear as bulleted lines starting with "* ["
        if re.match(r'\*[ ]*\[', line):
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    return data
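# Input sketch for processLinks() (hypothetical markup; based on the regex
# above, which matches bulleted external-link lines):
#   * [http://example.com Example site]
#   * [http://example.org Another reference]
# Matching lines are joined, tokenised, stopword-filtered, and stemmed.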
def advQueryProcessing(self, engine):
    """Processing based on self.advOptions values.

    options is a dictionary of the following terms:
    "allterms", "songname", "songend", "artist", "artistend",
    "genre", "pos", "from", "to"

    Arguments:
        engine -- object to which the Query object belongs
    """
    print("******Advanced query processing******")
    starttime = datetime.datetime.now()
    print("Start time", str(starttime))
    print("Query---", self.queryText)
    self.queryEngine = engine
    q_tokens = tokenize(self.queryText)  # tokenize queryText
    qs_tokens, qs_dict = stem(q_tokens)
    # if len(qs_tokens) == 0:  # uncomment to disallow blank query text
    #     engine.noResult = True
    #     return
    # All the documents satisfying the criteria
    querydocs = self.getAdvResults(qs_tokens)
    # Ranking based on score
    songHeap = []
    for doc in querydocs:
        if isinstance(querydocs, list):  # case where score is not available
            song = selSong(doc, {})
        else:
            song = selSong(doc, querydocs[doc])  # scores one document at a time
        songHeap.append(song)
    if len(songHeap) == 0:
        self.noResult = True
        printDuration(starttime)
        return
    self.queryResult = songHeap
    songHeap.sort(reverse=True)
    # Fetching song details
    count = engine.displayLength
    self.nextSongListPrep(0, count)
    # retrieval time
    printDuration(starttime)
def build_indexes():
    """Index the documents and update the db."""
    print("Building all indexes")
    conn = create_connection(dbname)
    cur = conn.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS terms(term TEXT PRIMARY KEY, cfreq INTEGER, dfreq INTEGER)"
    )
    cur.execute("""CREATE TABLE IF NOT EXISTS termdoc(
                       term INTEGER, docid INTEGER, tfreq INTEGER,
                       dscore REAL, posList TEXT,
                       FOREIGN KEY (term) REFERENCES terms(term),
                       FOREIGN KEY (docid) REFERENCES songs(id),
                       PRIMARY KEY (term, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permArtist(
                       key TEXT, docid TEXT,
                       FOREIGN KEY (docid) REFERENCES songs(id),
                       PRIMARY KEY (key))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permName(
                       key TEXT, docid TEXT,
                       FOREIGN KEY (docid) REFERENCES songs(id),
                       PRIMARY KEY (key))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS genreDoc(
                       genre TEXT, docid INTEGER,
                       FOREIGN KEY (docid) REFERENCES songs(id),
                       PRIMARY KEY (genre, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS yearDoc(
                       year TEXT, docid INTEGER,
                       FOREIGN KEY (docid) REFERENCES songs(id),
                       PRIMARY KEY (year, docid))""")
    cur.execute('SELECT * FROM songs')
    for row in cur:
        tokens = tokenize(row[5])
        if not tokens:
            continue
        stem_tokens, term_dict_local = stem(tokens)  # PorterStemmer
        updateTermTable(row[0], term_dict_local, conn)
        updateGenreYear(row[4], row[2], row[0], conn)
        permuterm(row[3], row[0], conn, "permArtist")  # artist permuterm index
        permuterm(row[1], row[0], conn, "permName")    # name permuterm index
    print("calculating tfidf")
    calculate_Tf_Idf(cur, conn)
    now = datetime.datetime.now()
    print(str(now))
    conn.commit()
    conn.close()
    now = datetime.datetime.now()
    print(str(now))
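# For reference, a minimal sketch of the permuterm rotations that the
# permArtist/permName tables above are built from (a hypothetical helper; the
# real permuterm() in this project also inserts the rotations into the given
# table). Rotating term + '$' lets a wildcard query such as 'ca*' be answered
# by a prefix lookup on the rotation that moves the '*' to the end.
def permuterm_rotations(term):
    augmented = term + '$'
    return [augmented[i:] + augmented[:i] for i in range(len(augmented))]

# Example: permuterm_rotations('cat') -> ['cat$', 'at$c', 't$ca', '$cat']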
def queryProcessing(self, engine):
    """Query processing for basic search.

    Arguments:
        engine -- object to which the Query object belongs
    """
    self.queryEngine = engine
    print("*********Query processing********")
    starttime = datetime.datetime.now()
    print("Start time", str(starttime))
    print("Query---", self.queryText)
    q_tokens = tokenize(self.queryText)  # tokenize queryText
    qs_tokens, qs_dict = stem(q_tokens)
    if len(qs_tokens) == 0:
        self.noResult = True
        return
    # All the documents that contain any of the terms
    querydocs = self.getIndexes(qs_tokens)
    # Ranking based on score
    songHeap = []
    for doc in querydocs:
        # scoring one document at a time
        song = selSong(doc, querydocs[doc])
        songHeap.append(song)
    if len(songHeap) == 0:
        self.noResult = True
        return
    self.queryResult = songHeap
    songHeap.sort(reverse=True)
    # Fetching song details
    if self.topSearch:
        count = 1
    else:
        count = engine.displayLength
    self.nextSongListPrep(0, count)
    # retrieval time
    printDuration(starttime)
def processInfo(text):
    data = text.split('\n')
    flag = -1
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            # Start of an infobox: keep the text after '{{infobox'
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                # A bare '}}' line closes the infobox
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    return data
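# The infobox block processInfo() expects looks like this (hypothetical markup,
# based on the regexes above):
#   {{infobox musical artist
#   | name = Some Artist
#   | genre = Rock
#   }}
# Lines between '{{infobox' and a bare '}}' line are collected, then tokenised,
# stopword-filtered, and stemmed like the other fields.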
def run(infile, vec_size, name="narr+ice+medhelp", stem=False):
    bin_dir = "/u/sjeblee/tools/word2vec/word2vec/bin"
    #bin_dir = "/u/yoona/word2vec/bin"

    # Input data
    data_dir = "/u/sjeblee/research/va/data/datasets/mds+rct/crossval_sets"
    #data_dir = "/u/yoona/test/mds+rct"  # hard-coded, Yoona's data location
    ice_data = "/u/sjeblee/research/data/ICE-India/Corpus/all-lower.txt"
    #ice_data = "/u/yoona/mds+rct/ice_all_lower.txt"
    medhelp_data = "/u/sjeblee/research/data/medhelp/all_medhelp_clean_lower.txt"
    #medhelp_data = "/u/yoona/mds+rct/all_medhelp_clean_lower.txt"
    suffix = ".narrsent"
    if stem:
        ice_data = "/u/sjeblee/research/data/ICE-India/Corpus/all-lower-stem.txt"
        medhelp_data = "/u/sjeblee/research/data/medhelp/all_medhelp_clean_stem.txt"
        suffix = ".narrsent.stem"

    # Output data
    text_data = data_dir + "/" + name + ".txt"
    vec_data = data_dir + "/" + name + ".vectors." + str(vec_size)
    if stem:
        text_data = data_dir + "/" + name + "_stem.txt"
        vec_data = data_dir + "/" + name + "_stem.vectors." + str(vec_size)

    # Quit if vectors already exist
    if os.path.exists(vec_data):
        print "Vectors already exist, quitting"
        return vec_data

    # Extract narrative text from the input file
    narrs = extract_features.get_narrs(infile)
    train_data = infile + suffix
    outfile = open(train_data, "w")
    for narr in narrs:
        if stem:
            narr = preprocessing.stem(narr)
        outfile.write(narr + "\n")
    outfile.close()

    # Combine all the text, one sentence per line for word2vec
    #filenames = [ice_data, medhelp_data, train_data]
    #filenames = [medhelp_data, train_data]
    filenames = [train_data]
    sentences = []
    outfile = open(text_data, "w")
    for fname in filenames:
        with open(fname) as inf:
            for line in inf.readlines():
                line = unicode(line, errors='ignore')
                outfile.write(line.strip().encode('utf8') + "\n")
                sentences.append(line.strip())
    outfile.close()

    window_size = 5
    num_threads = 12

    print "-- Training vectors..."
    #vec_model = Word2Vec(sentences, size=int(vec_size), window=window_size, min_count=1, workers=num_threads, negative=0, sg=1)
    #vec_model = FastText(sentences, size=int(vec_size), window=window_size, min_count=1, word_ngrams=1, min_n=2, max_n=6, workers=num_threads, negative=0)
    if not os.path.exists(vec_data):
        print "--------------------------------------------------------------------"
        process = subprocess.Popen(
            ["time", bin_dir + "/word2vec", "-train", text_data, "-output", vec_data,
             "-cbow", "1", "-size", str(vec_size), "-window", str(window_size),
             "-negative", "0", "-hs", "1", "-min-count", "1",
             "-sample", "1e-3", "-threads", str(num_threads), "-binary", "0"],
            stdout=subprocess.PIPE)
        output, err = process.communicate()
        print output

    print "-- Training binary vectors..."
    process = subprocess.Popen(
        ["time", bin_dir + "/word2vec", "-train", text_data, "-output", vec_data + ".bin",
         "-cbow", "1", "-size", str(vec_size), "-window", str(window_size),
         "-negative", "0", "-hs", "1", "-min-count", "1",
         "-sample", "1e-3", "-threads", str(num_threads), "-binary", "1"],
        stdout=subprocess.PIPE)
    output, err = process.communicate()
    print output
    print "-------------------------------------------------------------------------"
    return vec_data
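# Flag notes for the word2vec invocations above (standard options of Google's
# word2vec tool): -cbow 1 selects the CBOW architecture, -hs 1 enables
# hierarchical softmax (with -negative 0 turning off negative sampling),
# -sample 1e-3 downsamples very frequent words, and -binary 0/1 chooses text
# vs. binary output format for the trained vectors.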
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    train = False
    narratives = []
    keywords = []

    # Get the xml from file
    root = etree.parse(infile).getroot()
    if dict_keys == None:
        train = True
        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation",
                                     "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke",
                                     "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma",
                                     "CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
    print "dict_keys: " + str(dict_keys)
    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)

    # Extract features
    matrix = []
    for child in root:
        features = {}
        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
                item = child.find(key)
                value = "0"
                if item != None:
                    value = item.text
                if key == "AlcoholD" or key == "ApplytobaccoD":
                    if value == 'N':
                        value = "9"  # keep feature values as strings, matching the other branches
                features[key] = value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("", ""), string.punctuation)
                     for s in keyword_string.split(',')]
            # Split keyword phrases into individual words (build a new list
            # instead of mutating the one being iterated)
            split_words = []
            for word in words:
                for wx in word.split(' '):
                    split_words.append(wx.strip().strip('–'))
            keywords.append(" ".join(split_words))

        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
            narr_words = [w.strip() for w in
                          narr_string.lower().translate(string.maketrans("", ""), string.punctuation).split(' ')]
            text = " ".join(narr_words)
            if stem:
                narr_string = preprocessing.stem(text)
            elif lemma:
                narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix
    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames \
            or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames \
                or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
                ngram_range=(min_ngram, max_ngram), stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x, i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))
            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)
            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)
            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x, i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0, num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val
            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"
        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)
        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = [0] * dim
        for x in range(len(matrix)):
            narr = narratives[x]
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                vectors = vectors[(-1 * max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len - length):
                    matrix[x][narr_vec].insert(0, zero_vec)  # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0
        sequences = []
        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5',
                                filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)
        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
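# Note on the narr_seq branch above: Keras's hashing_trick() maps each word to
# an integer in [1, n] by hashing, so with n equal to the exact vocabulary size
# distinct words can collide; a common mitigation (an assumption here, not a
# change made above) is to pass a bucket count several times larger than the
# true vocabulary.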
def begin_search():
    f = open('./inverted_index/fileNumber.txt', 'r')
    global number_of_files
    number_of_files = int(f.read().strip())
    f.close()
    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()
    data = ""
    for query in queries:
        global K
        K = int(query.split(', ')[0])
        query = ' '.join(query.split(', ')[1:])
        query = query.lower()
        start = timeit.default_timer()
        if re.match(r'[tbicl]:', query):
            # Field query, e.g. "t:world cup b:football"
            tempFields = re.findall(r'([tbcil]):', query)
            words = re.findall(r'[tbcil]:([^:]*)(?!\S)', query)
            fields, tokens = [], []
            for i in range(len(words)):
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = field_query_ranking(tokens, fields)
        else:
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if len(results) > K:
                results = results[:K]
            for key in results:
                key = key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num + ', '
                if title is not None:
                    for i in title:
                        data += i + ' '
            data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"
        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
    with open('queries_op.txt', 'w') as f:
        f.write(data)
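# Expected query-file format, inferred from the parsing above: each line is
# "K, query", where K is the number of results to return and the query is
# either plain text or field-prefixed (t=title, b=body, i=infobox,
# c=category, l=links), e.g.:
#   10, t:world cup b:football
#   5, sachin tendulkar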