def get_guitars(text_file, guitar_filename=None):
    """Print every distinct "<word> guitar" phrase found in a text file.

    The file content is lower-cased and scanned for a word followed by a
    single space and ``guitar``. A phrase is skipped when the preceding
    word is in the collection returned by ``get_stopwords()``, is in
    ``IGNORE_WORDS``, or contains a digit. Every remaining phrase is
    printed, followed by one summary line with the number of phrases kept.

    Args:
        text_file: Path of the text file to scan.
        guitar_filename: Optional path of an output file. When given, the
            file is overwritten with the kept phrases, one per line.

    Returns:
        None. Results are printed and, optionally, written to
        ``guitar_filename``.
    """
    # Read everything up front so the input handle is closed immediately.
    with open(text_file, "r") as text_data:
        text = text_data.read().lower()

    # All occurrences of 'guitar' together with the preceding word,
    # de-duplicated. Sorted so the printed/written order is reproducible
    # instead of depending on set iteration order.
    guitars = sorted(set(re.findall(r"\w+ guitar", text)))

    stop_words = get_stopwords()
    digits = re.compile(r"\d")

    # try/finally (rather than a conditional context manager) guarantees the
    # optional output file is flushed and closed even if the loop raises.
    guitar_file = open(guitar_filename, "w") if guitar_filename else None
    count = 0
    try:
        for guitar in guitars:
            # The word that precedes "guitar" in the matched phrase.
            guitar_type = guitar.split()[0]
            # Ignore stop words and explicitly ignored words.
            if guitar_type in stop_words or guitar_type in IGNORE_WORDS:
                continue
            # Ignore preceding words that contain digits.
            if digits.search(guitar_type):
                continue
            if guitar_file is not None:
                guitar_file.write(guitar + "\n")
            count += 1
            # Single-argument parenthesised form prints the same text whether
            # print is a statement or a function.
            print(guitar)
    finally:
        if guitar_file is not None:
            guitar_file.close()

    print("The total number of types of guitars are %d" % count)
def store_stop_words(data_file):
    """Write the stop words that occur in a data file to STORE_WORDS_FILE.

    Each line of ``data_file`` is split on whitespace, and every token that
    is a member of the collection returned by ``get_stopwords()`` is
    collected once. After printing ``writing``, the collected words are
    written to ``STORE_WORDS_FILE`` one per line, overwriting any previous
    content.

    Args:
        data_file: Path of the text file to scan.

    Returns:
        None.
    """
    stop_words_list = get_stopwords()

    found = set()
    # Iterate the file lazily; the context manager closes it even if
    # reading fails part-way through.
    with open(data_file, 'r') as data:
        for line in data:
            found.update(
                word for word in line.split() if word in stop_words_list
            )

    # Single-argument parenthesised form prints the same text whether print
    # is a statement or a function.
    print("writing")

    with open(STORE_WORDS_FILE, 'w') as output:
        # Sorted so the file content is reproducible instead of depending on
        # set iteration order.
        for word in sorted(found):
            output.write(word + '\n')