コード例 #1
0
def get_guitars(text_file, guitar_filename=None):
    """Print the distinct "<word> guitar" phrases found in a text file.

    The file is read in full, lower-cased and searched for every word that
    is followed by a single space and the word "guitar". Duplicates are
    dropped, and a phrase is skipped when its leading word

    * is in the collection returned by ``get_stopwords()``,
    * is in ``IGNORE_WORDS``, or
    * contains a digit.

    Every surviving phrase is printed and, when ``guitar_filename`` is
    given, also written to that file one phrase per line. A summary line
    with the number of surviving phrases is printed at the end.

    Args:
        text_file: Path of the text file to scan.
        guitar_filename: Optional path of a file to write the phrases to.
            The file is created/truncated. Nothing is written when this is
            ``None`` or empty.
    """
    # Read the whole input up front so the handle is released immediately.
    with open(text_file, "r") as text_data:
        # Find all occurrences of 'guitar' together with the preceding word.
        guitars = re.findall(r"\w+ guitar", text_data.read().lower())

    guitar_file = open(guitar_filename, "w") if guitar_filename else None
    try:
        stop_words = get_stopwords()
        digits = re.compile(r"\d")
        count = 0
        for guitar in set(guitars):
            guitar_type = guitar.split()[0]

            # Ignore the phrase if the word preceding 'guitar' is a stop
            # word or one of the explicitly ignored words.
            if guitar_type in stop_words or guitar_type in IGNORE_WORDS:
                continue

            # Ignore the phrase if the preceding word contains digits.
            if digits.search(guitar_type):
                continue

            if guitar_file is not None:
                guitar_file.write(guitar + "\n")
            count += 1
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(guitar)
    finally:
        # Close (and thereby flush) the output file even if filtering fails.
        if guitar_file is not None:
            guitar_file.close()

    print("The total number of types of guitars are %d" % count)
コード例 #2
0
def store_stop_words(data_file):
    """Write the stop words that occur in ``data_file`` to ``STORE_WORDS_FILE``.

    Each line of ``data_file`` is split on whitespace, and every token that
    is contained in the collection returned by ``get_stopwords()`` is
    collected once. The collected words are then written to
    ``STORE_WORDS_FILE`` one per line, overwriting any previous content.
    The order of the written words is the iteration order of a set and is
    therefore not specified.

    Args:
        data_file: Path of the text file to scan for stop words.
    """
    stop_words_list = get_stopwords()

    found_words = set()
    # ``with`` guarantees the handle is closed even if reading fails;
    # iterating the file streams it line by line instead of loading it all.
    with open(data_file, 'r') as data:
        for line in data:
            for word in line.split():
                if word in stop_words_list:
                    found_words.add(word)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("writing")
    with open(STORE_WORDS_FILE, 'w') as output:
        for word in found_words:
            output.write(word + '\n')