Example #1
def __init__(self, lyrics_file, artist_name):
    """Read previously downloaded JSON and process its content into a dataframe."""
    df = pd.read_json(lyrics_file)
    df.rename(columns={"name": "album"}, inplace=True)
    df["year"] = pd.to_numeric(df["year"], downcast="integer")
    df = df.dropna()
    df["decade"] = df["year"] - df["year"] % 10
    df["decade"] = df["decade"].astype("int")
    df.drop("URL", axis=1, inplace=True)
    df.sort_values(["year", "song"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.drop_duplicates(subset="lyrics", keep="first", inplace=True)
    # Strip the "<Artist> - " prefix and " Lyrics" suffix from song titles.
    df["song"] = df["song"].str.replace(
        r"(^{0} - )|( Lyrics$)".format(artist_name.title()), "", regex=True)
    # Expand common English contractions before tokenizing.
    df["lyrics"] = df["lyrics"].str.replace("won't", "will not")
    df["lyrics"] = df["lyrics"].str.replace("can't", "can not")
    df["lyrics"] = df["lyrics"].str.replace("n't", " not")
    df["lyrics"] = df["lyrics"].str.replace("'m", " am")
    df["lyrics"] = df["lyrics"].str.replace("'re", " are")
    df["lyrics"] = df["lyrics"].str.replace("'ll", " will")
    df["lyrics"] = df["lyrics"].str.replace("'s", " is")
    df["lyrics"] = df["lyrics"].str.replace("'ve", " have")
    # Collapse immediately repeated words ("la la la" -> "la").
    df["lyrics"] = df["lyrics"].str.replace(r"(\w+\s)\1+", r"\1", regex=True)
    df["lyrics"] = df["lyrics"].str.replace(r"(\s\w+)\1+", r"\1", regex=True)
    self.data = df
    self.data["word_count"] = self.data.lyrics.apply(
        lambda x: len(list(tokenize(remove_stopwords(x), lower=True))))
    self.data["unique_words"] = self.data.lyrics.apply(
        lambda x: len(set(tokenize(remove_stopwords(x), lower=True))))
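
Below is a minimal standalone sketch of the word_count / unique_words computation used above, applied to a single made-up lyric string; it assumes only that gensim is installed.

from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import tokenize

lyric = "Hello hello, is there anybody in there?"   # made-up sample text
tokens = list(tokenize(remove_stopwords(lyric), lower=True))
word_count = len(tokens)         # tokens left after stop-word removal
unique_words = len(set(tokens))  # distinct tokens
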
def prepare_index(doc_path):
    """
    Presist dictionary, corpus, and index into disk
    So they can be reused later on
    """
    with open(doc_path) as input_f:
        file_name, _ = os.path.splitext(doc_path)
        raw_syllabus = input_f.read().replace("\n", "")
        documents = [remove_stopwords(raw_syllabus)]
        texts = [[
            word for word in document.lower().split() if word not in STOP_LIST
        ] for document in documents]

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save("{}.dict".format(file_name))

        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize("{}.mm".format(file_name), corpus)

        lsi = LsiModel(corpus, id2word=dictionary, num_topics=1)
        index = similarities.MatrixSimilarity(lsi[corpus])
        index.save("{}.index".format(file_name))
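
As a hedged follow-up (not part of the example), the persisted artifacts could later be reloaded and queried roughly as sketched below. The helper name query_saved_index is hypothetical; file naming follows the "<file_name>.dict/.mm/.index" pattern above, and the LsiModel is rebuilt from the stored corpus because prepare_index does not save it.

from gensim import corpora, similarities
from gensim.models import LsiModel
from gensim.parsing.preprocessing import remove_stopwords

def query_saved_index(file_name, query):
    # reload the persisted dictionary, corpus, and similarity index
    dictionary = corpora.Dictionary.load("{}.dict".format(file_name))
    corpus = corpora.MmCorpus("{}.mm".format(file_name))
    index = similarities.MatrixSimilarity.load("{}.index".format(file_name))
    # the LSI model itself was not persisted, so rebuild it from the stored corpus
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=1)
    query_bow = dictionary.doc2bow(remove_stopwords(query).lower().split())
    return list(enumerate(index[lsi[query_bow]]))
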
def clean_data(text):
    text = re.sub(r'@[\w]*', '', text)        # remove @user mentions
    text = re.sub(r'&', '', text)             # remove ampersands
    text = re.sub(r'[?!.;:,#@-]', '', text)   # remove punctuation / special characters
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove non-ASCII characters
    text = re.sub(r'[^A-Za-z ]', '', text)    # keep only letters and spaces
    text = text.lower()                       # lowercase for uniformity
    # remove stop-words, e.g. 'we', 'our', 'ours', 'ourselves', 'just', 'don', "don't", 'should'
    text = remove_stopwords(text)
    return text
Example #4
def clean_data(text):
    text = re.sub(r'@[\w]*', '', text)        # remove @user mentions
    text = re.sub(r'&', '', text)             # remove ampersands
    text = re.sub(r'[?!.;:,@-]', '', text)    # remove punctuation / special characters (keep '#' for hashtags)
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove non-ASCII characters
    text = re.sub(r'[^A-Za-z# ]', '', text)   # keep only letters, hashes, and spaces
    text = text.lower()                       # lowercase for uniformity
    # remove short words of length 3 or lower (e.g. "hmm", "oh") since they don't add any value
    text = " ".join(w for w in text.split() if len(w) > 3)
    # remove stop-words, e.g. 'we', 'our', 'ours', 'ourselves', 'just', 'don', "don't", 'should'
    text = remove_stopwords(text)
    return text
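
A quick, hypothetical check of clean_data on a made-up tweet-style string; re and gensim's remove_stopwords are imported here so the snippet runs on its own.

import re
from gensim.parsing.preprocessing import remove_stopwords

sample = "@user Loving this #NLP tutorial!!! So much to learn & try :)"
print(clean_data(sample))
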
def compute_collection_frequency(query_dict, doc_dict):
    """Count how often each stemmed query term occurs across the whole document collection."""
    for qno, query in query_dict.items():
        for qterm in remove_stopwords(query).split():
            term_count = 0
            qterm_stem = stemmer.stem(qterm.lower().strip())
            # print('qterm : ', qterm_stem)
            for docid, doc in doc_dict.items():
                for term in doc.split():
                    if term == qterm_stem:
                        term_count += 1
            # print('term count : ', qterm_stem, '\t', term_count)
            collection_freq_dict[qterm_stem] = term_count
    return collection_freq_dict
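
A toy invocation sketch: stemmer and collection_freq_dict are assumed to be the module-level objects the function uses above, and the query/document strings (with documents already stemmed) are invented.

from collections import defaultdict
from nltk.stem import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

stemmer = PorterStemmer()
collection_freq_dict = defaultdict(int)

queries = {"q1": "ranking models for retrieval"}                       # made-up query
docs = {"d1": "rank model score document", "d2": "model estim relev"}  # pre-stemmed docs
print(compute_collection_frequency(queries, docs))
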
Example #6
def jaccard_similarity(initial_dict, var_dict):
    """Store, for each query id, the Jaccard similarity between the initial query and each of its variants."""
    stemmer = PorterStemmer()
    for qid in initial_dict:
        similarity_list = []
        initial = remove_stopwords(initial_dict[qid])
        initial_stem = stemmer.stem(initial.lower().strip())
        initial_set = set(initial_stem.split())
        # print("initial set : ", initial_set)
        variant_list = var_dict[qid]
        # print("var list : ", variant_list)
        for var in variant_list:
            # print("one var : ", var)
            variant = remove_stopwords(var)
            variant_stem = stemmer.stem(variant.lower().strip())
            variant_set = set(variant_stem.split())
            # print("var set : ", variant_set)
            intersec = initial_set.intersection(variant_set)
            # print("intersection : ", intersec)
            similarity = round(
                float(len(intersec)) /
                (len(initial_set) + len(variant_set) - len(intersec)), 4)
            # print("similarity : ", similarity)
            similarity_list.append(similarity)
        query_similarity[qid] = np.array(similarity_list).astype(float)
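
A hypothetical call: query_similarity is assumed to be the module-level dict the function fills in, and the query text and variants are invented.

import numpy as np
from nltk.stem import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

query_similarity = {}
initial = {"q1": "cheap flights to new york"}
variants = {"q1": ["inexpensive flights to new york", "cheap new york airfare"]}
jaccard_similarity(initial, variants)
print(query_similarity["q1"])  # one Jaccard score per variant
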
def compute_document_frequency(query_dict, doc_dict):
    """Count, for each stemmed query term, the number of documents that contain it."""
    for qno, query in query_dict.items():
        for qterm in remove_stopwords(query).split():
            qterm_stem = stemmer.stem(qterm.lower().strip())
            print('qterm : ', qterm_stem)
            if qterm_stem in document_freq_dict:
                continue  # document frequency already computed for this stem
            doc_count = 0
            for docid, doc in doc_dict.items():
                # each document contributes at most once per query term
                if qterm_stem in doc.split():
                    doc_count += 1
            document_freq_dict[qterm_stem] = doc_count
            print('doc freq : ', qterm_stem, '\t', document_freq_dict[qterm_stem])
    return document_freq_dict
Example #8

def writeFile(anyDict, path):
    # write one "key,value" row per dictionary entry
    with open(path, 'w', newline='') as out_f:
        w = csv.writer(out_f)
        for key, value in anyDict.items():
            w.writerow([key, value])


with open('sms-spam-corpus.csv', newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        # strip non-letters, lowercase, drop stop-words, then stem the message text
        cleaned = stemSentence(
            remove_stopwords(
                re.sub(r'[^A-Za-z]+', r' ', row['v2']).lower()))
        if row['v1'] == 'spam':
            slist.append(cleaned)
        else:
            hlist.append(cleaned)

    #print(slist)
    #print(hlist)
    amountOfHamSentence = len(hlist)
    amountOfSpamSentence = len(slist)
    all_sentence = amountOfHamSentence + amountOfSpamSentence
    P_Ham = amountOfHamSentence / all_sentence
    P_Spam = amountOfSpamSentence / all_sentence
Example #9
    return counts


def writeFile(anyDict, path):
    # write one "key,value" row per dictionary entry
    with open(path, 'w', newline='') as out_f:
        w = csv.writer(out_f)
        for key, value in anyDict.items():
            w.writerow([key, value])


with open('sms-spam-corpus.csv', newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        # strip non-letters, lowercase, drop stop-words, then stem the message text
        cleaned = stemSentence(remove_stopwords(re.sub(r'[^A-Za-z]+', r' ', row['v2']).lower()))
        if row['v1'] == 'spam':
            slist.append(cleaned)
        else:
            hlist.append(cleaned)

    # print(slist)
    # print(hlist)
    sDict = word_count(slist)
    hDict = word_count(hlist)
    # print(sDict)
    # print(hDict)
field_names = ['word', 'count']
writeFile(sDict, 'mycsvfile.csv')
writeFile(hDict, 'mycsvfile1.csv')
Example #10
                # print("VECTOR : ", trec_corpus)
            # else:
            #     trec_corpus[parts[0]].append(vectors.vocab[w].index)
            else:
                oov += 1

trec_text_collection = TextCollection(trec_text_collection_data)
print('all ', count, ' docs loaded')
print('total ', oov, ' out-of-vocabulary words were not included')

# load topics file
trec_topics = {}  # topic -> list of query term vector ids
max_topic_word_count = 0
with open(arg_topics_file, 'r') as inputFile:
    for line in inputFile:
        line = remove_stopwords(line)
        parts = line.split(' ', 1)

        if parts[0] not in trec_topics:
            trec_topics[parts[0]] = []

        for w in parts[1].split(' '):
            # w = w.strip()
            ws = stemmer.stem(w.lower().strip())  # for stemming query terms
            # print("QUERY : ", ws)
            # ws = w.strip()     # if query terms should be unstemmed
            if ws in vectors.vocab:
                trec_topics[parts[0]].append(vectors.vocab[ws].index)
            else:
                print(ws, ' -- not in .vec')