def get_music_bio(params):
    """Build a vocabulary and index-encoded sentences from a CSV of texts.

    Reads the CSV at ``params['csv_path']`` and works on the ``content`` field
    of its first ``params['D']`` rows. Words are obtained by passing text
    through ``remove_punctuation``, lower-casing and splitting on whitespace.

    Args:
        params: Dict with the keys ``'csv_path'``, ``'D'`` and ``'V'``.
            A falsy ``'D'`` means "use every row"; a falsy ``'V'`` means
            "use every distinct word". Both entries are filled in on the
            dict that was passed (it is mutated, and also returned).

    Returns:
        A tuple ``(sentences, sents, word2idx, all_word_counts_idx, params)``:

        * ``sentences`` - cleaned, lower-cased sentence strings (content split
          on ``"."``); sentences with fewer than two words are skipped.
        * ``sents`` - the same sentences as lists of word indices, with
          out-of-vocabulary words mapped to the ``'<UNK>'`` index.
        * ``word2idx`` - mapping of the ``V - 1`` most frequent words plus
          ``'<UNK>'`` (last) to their index.
        * ``all_word_counts_idx`` - mapping of position in ``word2idx`` to the
          number of occurrences of that word (0 for ``'<UNK>'``).
        * ``params`` - the input dict with ``'D'`` and ``'V'`` resolved.
    """
    # 'records' gives one dict per CSV row. The previous orient string was a
    # misspelling that current pandas rejects with ValueError.
    files = pd.read_csv(params['csv_path']).to_dict('records')
    params['D'] = params['D'] if params['D'] else len(files)
    documents = files[:params['D']]

    # Count occurrences; documents with fewer than two words contribute
    # nothing. The first occurrence counts as 1 (it used to be stored as 0,
    # leaving every count one too low).
    word_counts = {}
    for doc in documents:
        tokens = remove_punctuation(doc['content']).lower().split()
        if len(tokens) > 1:
            for word in tokens:
                word_counts[word] = word_counts.get(word, 0) + 1

    params['V'] = params['V'] if params['V'] else len(word_counts)
    # The effective vocabulary cannot exceed the number of distinct words.
    V = min(params['V'], len(word_counts))

    # Most frequent first; the last vocabulary slot is reserved for '<UNK>'.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    top_words = [w for w, _ in ranked[:V - 1]] + ['<UNK>']
    word2idx = {w: i for i, w in enumerate(top_words)}
    all_word_counts_idx = {
        idx: 0 if w == '<UNK>' else word_counts[w]
        for idx, w in enumerate(word2idx)
    }
    print("finished counting")

    unk = word2idx['<UNK>']
    sents = []
    sentences = []
    for doc in documents:
        for sentence in doc['content'].split("."):
            sentence = remove_punctuation(sentence).lower()
            tokens = sentence.split()
            if len(tokens) > 1:
                # Words outside the vocabulary fall back to the '<UNK>' index.
                sents.append([word2idx.get(w, unk) for w in tokens])
                sentences.append(sentence)
    return sentences, sents, word2idx, all_word_counts_idx, params
Exemple #2
0
def main():
    """Load the text file and return its paragraphs, raw and preprocessed.

    This function covers tasks 1.1 - 1.6.

    Returns:
        A tuple ``(par_copy, paragraphs)``: ``par_copy`` is a shallow copy of
        the paragraphs taken after the "Gutenberg"/"gutenberg" filtering but
        before any further processing; ``paragraphs`` is the result of then
        applying ``functions.tokenize``, ``functions.remove_punctuation`` and
        ``functions.stem`` in that order.
    """
    # Close the handle once the paragraphs are built (it was previously left
    # open). Assumes makeParagraphArray consumes the file eagerly - confirm.
    with codecs.open(text_file, "r", "utf-8") as f:
        paragraphs = functions.makeParagraphArray(f)

    # Filter on both spellings of "Gutenberg", then keep a copy of the
    # paragraphs as they are before tokenization.
    paragraphs = functions.remove_specific_word("Gutenberg", paragraphs)
    paragraphs = functions.remove_specific_word("gutenberg", paragraphs)
    par_copy = copy.copy(paragraphs)

    paragraphs = functions.tokenize(paragraphs)
    paragraphs = functions.remove_punctuation(paragraphs)
    paragraphs = functions.stem(paragraphs)

    return par_copy, paragraphs
def main(sc, argv):
    """Print each word of a text file with its count, lowest count first.

    ``argv[1]`` is the path handed to ``sc.read.text``; no other entry of
    ``argv`` is read. Lines are split on single spaces, the words are passed
    through ``udfStr.remove_punctuation('word')``, grouped and counted, and
    the collected rows are printed between two separator lines.

    NOTE(review): ``sc`` is presumably a Spark session - confirm with caller.
    """
    path = argv[1]
    separator = '-' * 50

    lines_df = sc.read.text(path)
    words_df = lines_df.select(explode(split(lines_df.value, ' ')).alias('word'))
    cleaned_df = words_df.transform(udfStr.remove_punctuation('word'))
    rows = cleaned_df.groupBy('word').count().collect()

    print(separator)

    # Sort on the second field of each collected row, in ascending order.
    ordered_rows = sorted(rows, key=lambda row: row[1])
    for row in ordered_rows:
        print(row)

    print(separator)
def remove_test():
    """Check that punctuation around a word is stripped and case is kept."""
    raw = '!!!Hello!@#?'
    cleaned = remove_punctuation(raw)
    assert cleaned == 'Hello'
def test_remove_punctuation():
    """Check that remove_punctuation is callable and strips and lower-cases."""
    assert callable(remove_punctuation)

    # Mixed-case input with embedded punctuation comes back as lower-case
    # letters only.
    result = remove_punctuation("hEllO,hOware!yOU")
    assert result == "hellohowareyou"