def stemWords(input_tokens):
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in input_tokens:
        stemmed_words.append(str(stemmer.stem(token, 0, len(token) - 1)))
    return stemmed_words
def stemWords(tokens):
    """Stems tokens."""
    stemmer = PorterStemmer()
    stemmedWords = []
    for token in tokens:
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        stemmedWords.append(stemmed)
    return stemmedWords
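# Hedged usage sketch for the two stemWords variants above. Both assume a PorterStemmer
# class exposing the classic reference interface stem(word, start, end) (e.g. the
# original porter.py distribution); note that NLTK's PorterStemmer.stem() takes only
# the word itself.
# >>> stemWords(["running", "flies", "easily"])
# ['run', 'fli', 'easili']   # expected output with the standard Porter rules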
def porter_stem(corp):
    """
    Builds a dictionary with words as keys and stems as the values.
    """
    from porterstemmer import PorterStemmer
    ps = PorterStemmer()
    psdict = {}
    for w in corp.words:
        psdict[w] = ps.stem(w)
    return psdict
def task1(input_file_name, output_file_name, stop_words_list):
    # open the input file and the list of stop words, and create the output file
    f_input = open(input_file_name, "r")
    f_output = open(output_file_name, "w+")
    f_stop_words = open(stop_words_list, "r")
    list_lines = f_input.readlines()
    # list of stop words
    list_stop_words = f_stop_words.readlines()
    list_stop_words = list(map(lambda x: x.strip(), list_stop_words))
    # list of document names
    list_documents = []
    ps = PorterStemmer()
    for i in range(len(list_lines)):
        list_words = []           # list of words for a line
        list_words_stemming = []  # list of stemmed words for a line
        list_documents.append(list_lines[i].split()[0])
        # replace all the \t and \n with spaces
        list_lines[i] = re.sub(r'\s', " ", list_lines[i])
        # change upper case to lower case
        list_lines[i] = list_lines[i].lower()
        # remove numbers
        list_lines[i] = list_lines[i].translate(str.maketrans('', '', digits))
        # remove punctuation
        list_lines[i] = re.sub(r'[^a-zA-Z0-9\s]', '', list_lines[i])
        for w in list_lines[i].split()[1:]:
            if w not in list_stop_words:
                list_words.append(w)
        for y in list_words:
            list_words_stemming.append(ps.stem(y, 0, len(y) - 1))
        # write the document name in front of the content in the output file
        f_output.write(list_documents[i] + "\t")
        # write the content of the document in the output file
        for z in list_words_stemming:
            f_output.write(z + " ")
        f_output.write("\n")
    # close all the files
    f_output.close()
    f_input.close()
    f_stop_words.close()
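# Hypothetical invocation of task1 (the file names below are placeholders, not from the
# original assignment). The function also relies on module-level `import re`,
# `from string import digits`, and a PorterStemmer with the stem(word, start, end)
# signature.
# task1("collection.txt", "collection_preprocessed.txt", "stopwords.txt")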
def add_tokens():
    """Return a posting list of all unique words in the collection."""
    # consider only the title, date, author, category and post_text columns
    # reason: the url columns contain redundant information (title) & the other
    # columns are numbers, not useful to the vector space model
    title_file = "TUAW-dataset/data/title.txt"
    date_file = "TUAW-dataset/data/date.txt"
    author_file = "TUAW-dataset/data/author.txt"
    category_file = "TUAW-dataset/data/category.txt"
    post_text_file = "TUAW-dataset/data/post_text.txt"
    posting_list = {}
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))
    doc_id = -1
    total_num_docs = 0
    # read the same line of the files together
    # open(date_file) as date_fd, \
    with open(title_file) as title_fd, \
            open(author_file) as author_fd, \
            open(category_file) as category_fd, \
            open(post_text_file) as post_text_fd:
        lines = zip(title_fd, author_fd, category_fd, post_text_fd)
        for line in lines:
            total_num_docs += 1
            doc_id += 1  # == line_num
            if doc_id % 1000 == 999:
                print("Processed " + str(doc_id + 1) + " posts")
            # title + author + category + post_text
            line_string = line[0].strip() + " " + line[1].strip() + " " + \
                line[2].strip() + " " + line[3].strip()
            # normalize the terms in the line == post
            term_list = normalize(line_string, stemmer, stopwords_set)
            # add every word to the posting list
            for word in term_list:
                # type(posting_list) == { term: [df, {doc_id: tf}] }
                if word in posting_list:
                    doc_dict = posting_list[word][1]
                    if doc_id in doc_dict:
                        doc_dict[doc_id] = doc_dict[doc_id] + 1
                    else:
                        posting_list[word][0] += 1
                        doc_dict[doc_id] = 1
                elif len(word) > 0:  # add only words of non-zero length, check again
                    temp_dict = {}
                    temp_dict[doc_id] = 1
                    posting_list[word] = [1, temp_dict]
    return (total_num_docs, posting_list)
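# Sketch of consuming the index built by add_tokens(). term_stats() is a hypothetical
# helper, not part of the original project; it only relies on the documented structure
# posting_list == { term: [df, {doc_id: tf}] } and the total_num_docs count returned
# above, and uses the common idf = log10(N / df) weighting as an assumption.
import math

def term_stats(term, total_num_docs, posting_list):
    """Return (df, idf, {doc_id: tf}) for @term, or (0, 0.0, {}) if it is unindexed."""
    if term not in posting_list:
        return (0, 0.0, {})
    df, doc_tfs = posting_list[term]
    idf = math.log10(total_num_docs / df)
    return (df, idf, doc_tfs)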
def search(query_string, k, line_num_dict, N):
    """Return the top @k search results for @query_string from the corpus of @N documents,
    using @line_num_dict as a lookup table."""
    stemmer = PorterStemmer()
    stopwords_set = set(stopwords.words("english"))
    # normalize the query
    term_list = normalize(query_string, stemmer, stopwords_set)
    query_freq = {}  # number of occurrences of every unique term
    for term in term_list:
        if term in query_freq:
            query_freq[term] = query_freq[term] + 1
        elif len(term) > 0:  # add only terms of non-zero length
            query_freq[term] = 1
    # retrieve only the necessary posting lists, in the order they appear in the file
    lines_to_get = []
    for term in query_freq.keys():
        if term in line_num_dict:  # skip query terms that never occur in the data
            lines_to_get += [line_num_dict[term]]
    lines_to_get.sort()
    # if no word in the query occurs in the data, the posting list will be empty
    if len(lines_to_get) == 0:
        print("No results found")
        sys.exit(0)
    posting_list = get_posting_list(lines_to_get)
    (weight_query, doc_dict) = calc_weights(query_freq, posting_list, N)
    top_k = get_top_k(weight_query, doc_dict, k)
    # result = doc_id + score + title + url
    title_file = "TUAW-dataset/data/title.txt"
    post_url_file = "TUAW-dataset/data/post_url.txt"
    # sort based on doc_id for efficient retrieval
    docs_to_get = []
    for doc_id in top_k.keys():
        docs_to_get += [doc_id]
    docs_to_get.sort()
    current_index = 0
    with open(title_file) as title_fd, open(post_url_file) as post_url_fd:
        lines = zip(title_fd, post_url_fd)
        for i, line in enumerate(lines):
            if i == docs_to_get[current_index]:
                title_string = "Title = " + line[0]
                post_url_string = "URL = " + line[1]
                top_k[i][1] = title_string + post_url_string
                current_index += 1
                if current_index == len(docs_to_get):
                    break
    # sort top_k based on score
    result = OrderedDict(
        sorted(top_k.items(), key=lambda t: t[1][0], reverse=True))
    # print output
    num_results = 1
    for doc_id, [score, details] in result.items():
        print(str(num_results) + ". Doc_ID = " + str(doc_id) +
              " ; Score = " + str(score))
        print(details)
        num_results += 1
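# Hypothetical driver for search() (hedged: load_index is a made-up name for whatever
# builds N and line_num_dict elsewhere in this project; get_posting_list, calc_weights,
# get_top_k and normalize are assumed to be defined in the same module, along with
# `import sys` and `from collections import OrderedDict`).
# (N, line_num_dict) = load_index()
# search("iphone battery life", 10, line_num_dict, N)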
def stemWord(word):
    stemmer = PorterStemmer()
    return stemmer.stem(word, 0, len(word) - 1)
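# Example, assuming the same classic stem(word, start, end) interface as above:
# stemWord("running") returns "run" and stemWord("connection") returns "connect".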
def parsetoken(db, line):
    global documents
    global tokens
    global terms
    #
    # Create an instance of the PorterStemmer object; we will call the stemmer method
    # on this object to 'stem' the tokens extracted from the line.
    #
    p = PorterStemmer()

    # this replaces any tab characters with a space character in the line
    # read from the file
    line = line.replace('\t', ' ')
    line = line.strip()
    # line.encode('ascii', 'ignore')
    #
    # This routine splits the contents of the line into tokens
    l = splitchars(line)

    # process each token in the line
    for elmt in l:
        # This statement removes the newline character if found
        elmt = elmt.replace('\n', '')

        # This statement converts all letters to lower case
        lowerElmt = elmt.lower().strip()
        #
        # Increment the counter of the number of tokens processed. This value will
        # provide the total size of the corpus in terms of the number of terms in the
        # entire collection.
        #
        tokens += 1

        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        if len(lowerElmt) < 2:
            continue
        #
        # if the token is in the stopwords list then do not include it in the term
        # dictionary and do not index the term.
        #
        if lowerElmt in stopwords:
            continue
        #
        # This section checks whether the term is a number and, if so, does not add it
        # to the index. This is accomplished by attempting to convert the term into an
        # integer. If the term contains non-numeric characters the conversion fails, we
        # catch the error, and continue processing the term. If the term is a number the
        # conversion succeeds and we ignore it (the continue statement moves on to the
        # next item from the 'for' loop).
        #
        try:
            dummy = int(lowerElmt)
        except ValueError:
            # value is not a number, so we can index it
            stemword = lowerElmt
        else:
            # value is a number, so we will NOT add it to the index
            continue
        #
        # In the following short section of the code we call the Porter stemmer code
        # that we have included in our indexer process. This algorithm stems the
        # tokens, which reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary
        # then add the term
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
            db[lowerElmt].docids = dict()
            db[lowerElmt].docs = 0

        # if the document is not currently in the postings
        # list for the term then add it
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # Increment the counter that tracks the term frequency
        db[lowerElmt].docids[documents] += 1
    return l
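# Hedged sketch of how parsetoken() is typically driven (illustrative only, not the
# project's actual main routine; splitchars(), the stopwords list, the Term class and
# the global counters are assumed to be defined at module level as in the surrounding
# indexer code).
# for filename in corpus_files:        # hypothetical list of documents to index
#     documents += 1                   # parsetoken() uses this as the current doc id
#     with open(filename) as fd:
#         for line in fd:
#             parsetoken(database, line)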
def generate_feature_csv(csv_out, csv_in="bechdel_full.csv",
                         female_word_filename=None, female_name_filename=None,
                         verbose=False):
    """
    Given a csv file csv_in of raw movie data, build a feature table and write it to csv_out.
    """
    if verbose:
        print("Generating basic features and booleans...")
    raw_data = pd.read_csv(csv_in)
    data = pd.DataFrame(index=raw_data.index)
    data["Bechdel_pass"] = [1 if x == "pass" else 0 for x in raw_data["Bechdel_rating"]]
    data["Year"] = raw_data["Year"]

    # Only 2 films have N/A votes and ratings. I think it's OK to just zero
    # their votes/ratings here
    data["imdbRating"] = [x if x != "N/A" else 0 for x in raw_data["imdbRating"]]
    data["imdbVotes"] = [int(re.sub(",", "", x)) if x != "N/A" else 0
                         for x in raw_data["imdbVotes"]]

    # Adding booleans for month (not present for all releases). The thinking is
    # that movie "types" are released in seasons - blockbusters in the summer,
    # Oscar winners near year's end - and this may impact Bechdel rating.
    release_months = [
        datetime.datetime.strptime(x, "%d %b %Y").month if x != "N/A" else None
        for x in raw_data["Released"]
    ]
    release_months = level_booleans(release_months, "Month", zeros_ones=True)
    for col in release_months.columns:
        data[col] = release_months[col]

    # Booleans for parental rating. Uses the rating_bucket function to deal
    # with the wide variety of rating types.
    rating_buckets = [rating_bucket(x) for x in raw_data["Rated"]]
    rating_buckets = level_booleans(rating_buckets, "Rating", zeros_ones=True)
    for col in rating_buckets.columns:
        data[col] = rating_buckets[col]

    # Genre membership; this was actually easy to process because the values are
    # pretty clean
    genre_membership = level_booleans(raw_data["Genre"], "Genre", sep=", ", zeros_ones=True)
    for col in genre_membership.columns:
        data[col] = genre_membership[col]

    # Runtime in minutes
    runtime_re = re.compile(r"((?P<hr>\d+) h){0,1} {0,1}((?P<min>\d+) min){0,1}")
    runtime_mins = []
    runtime_na = []
    for runtime_str in raw_data["Runtime"]:
        if runtime_str == "N/A":
            runtime_mins.append(0)
            runtime_na.append(1)
        else:
            runtime_match = runtime_re.match(runtime_str)
            (runtime_hr, runtime_min) = runtime_match.group("hr"), runtime_match.group("min")
            if runtime_hr is None:
                runtime_hr = 0
            if runtime_min is None:
                runtime_min = 0
            runtime_mins.append(int(runtime_hr) * 60 + int(runtime_min))
            runtime_na.append(0)
    data["Runtime"] = runtime_mins
    data["Runtime_na"] = runtime_na

    if verbose:
        print("Generating word-based features (stemmed words and female names)...")
    # Porter-stem titles and plot summaries, and look for "female words"
    # (like 'she', 'woman', etc.)
    if female_word_filename is not None:
        ps = PorterStemmer()
        f = open(female_word_filename, "r")
        female_stems = set([ps.stem(x.strip().lower(), 0, len(x.strip()) - 1) for x in f])
        f.close()
        has_female_word = []
        for plot in raw_data["Title"] + " " + raw_data["Plot"]:
            if plot == "N/A":
                has_female_word.append(None)
            else:
                cur_has_female_word = 0
                plot_clean = re.sub(r"[^\w\s]", " ", plot).lower().strip()
                plot_words = re.split(r"\s+", plot_clean)
                plot_stems = [ps.stem(x, 0, len(x) - 1) for x in plot_words]
                for plot_stem in plot_stems:
                    if plot_stem in female_stems:
                        cur_has_female_word = 1
                        break
                has_female_word.append(cur_has_female_word)
        data["Female_word"] = has_female_word

    # Number of female names in the actor list: 0 or 1 (and anything not
    # flagged as either should be considered 2+)
    if female_name_filename is not None:
        f = open(female_name_filename, "r")
        female_nameset = set([x.strip().lower() for x in f])
        f.close()
        has_0_female_name = []
        has_1_female_name = []
        for actor_list in raw_data["Actors"]:
            if actor_list == "N/A":
                # again this issue only comes up twice
                has_0_female_name.append(0)
                has_1_female_name.append(0)
            else:
                actor_clean = re.sub(r"[^\w\s]", " ", actor_list).lower().strip()
                actor_names = re.split(r"\s+", actor_clean)
                female_name_count = 0
                for actor_name in actor_names:
                    if actor_name in female_nameset:
                        female_name_count += 1
                if female_name_count == 0:
                    has_0_female_name.append(1)
                    has_1_female_name.append(0)
                elif female_name_count == 1:
                    has_0_female_name.append(0)
                    has_1_female_name.append(1)
                else:
                    has_0_female_name.append(0)
                    has_1_female_name.append(0)
        data["Actress_0"] = has_0_female_name
        data["Actress_1"] = has_1_female_name

    data.to_csv(csv_out, index=False)
    if verbose:
        print("Feature generation complete, output to %s." % csv_out)
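# Hypothetical call (the output and word/name list paths are placeholders;
# bechdel_full.csv is the default input named above). Requires pandas as pd, re,
# datetime, and the level_booleans()/rating_bucket() helpers defined elsewhere in
# this module, plus the classic PorterStemmer used throughout.
# generate_feature_csv("bechdel_features.csv",
#                      female_word_filename="female_words.txt",
#                      female_name_filename="female_names.txt",
#                      verbose=True)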
# the database is a simple dictionary
database = {}

# regular expressions for: extracting words, extracting the ID from a path,
# and checking for hex values
chars = re.compile(r'\W+')
atLeast3Chars = re.compile(r'\w{3,}')
notDigit = re.compile(r'\D*')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# global counters (tokens processed, documents seen, unique terms, stop words found)
tokens = 0
documents = 0
terms = 0
stopWordsFound = 0

stemmer = PorterStemmer()

#
# We will create a Term object for each unique instance of a term
#
class Term():
    termid = 0
    termfreq = 0
    docs = 0
    docids = {}

# The code added:
# ===================================================================
# Calculate the inverse document frequency
# ===================================================================
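# A minimal sketch of the IDF step announced above, assuming `documents` ends up holding
# the total number of documents indexed and each Term's `docs` field is its document
# frequency (both maintained by parsetoken()). compute_idf is a hypothetical helper that
# uses the common idf = log10(N / df) weighting; the original project may use a
# different variant.
import math

def compute_idf(db, num_documents):
    """Attach an `idf` attribute to every Term object in @db."""
    for entry in db.values():
        entry.idf = math.log10(float(num_documents) / entry.docs) if entry.docs else 0.0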