コード例 #1
0
ファイル: naives.py プロジェクト: LoveToCommit/nlp-1
def all_words(lyrics):
    """Yield every lowercased content word from titled songs.

    Parameters:
        lyrics: iterable of dicts, each with a "lyrics" list of strings
            and (optionally) a "title".

    Yields:
        Each token, lowercased, whose song has a title, that is not a
        stopword, and whose unigram POS tag passes ``valid``.
    """
    for song in lyrics:
        # The title check does not depend on the token; hoisting it out
        # of the inner loop also skips tokenizing untitled songs entirely.
        if not song.get("title"):
            continue
        for word in nltk.word_tokenize(" ".join(song["lyrics"])):
            # valid() inspects only the POS tag of the single-token tagging.
            if not is_stopword(word) and valid(unigram_tagger.tag([word])[0][1]):
                yield word.lower()
コード例 #2
0
ファイル: tfidf.py プロジェクト: Big-Data/nlp
def add_info(lyric, text_col, i=0):
    """Return a copy of *lyric* enriched with tokens, terms and tf-idf.

    Parameters:
        lyric: dict with a "lyrics" list of strings.
        text_col: collection exposing ``tf_idf(term, text)``
            (e.g. nltk.TextCollection).
        i: progress counter, printed after incrementing.

    Returns:
        A new dict (the input is not mutated) with "lyrics" replaced by a
        tokenized, lowercased Text, plus "terms" (content-word set) and
        "tf_idf" (term -> score, built from the ascending-sorted pairs).
    """
    enriched = dict(lyric)
    # Lowercase every token of the utf-8 encoded, space-joined lyric lines.
    enriched["lyrics"] = Text(
        w.lower()
        for w in nltk.word_tokenize(" ".join(enriched["lyrics"]).encode("utf-8")))
    # Content words only: drop stopwords and tokens failing valid().
    enriched["terms"] = set(
        w for w in enriched["lyrics"] if not is_stopword(w) and valid(w))
    # Score each term against the whole collection, sorted by tf-idf.
    enriched["tf_idf"] = dict(
        sorted(((t, text_col.tf_idf(t, enriched["lyrics"])) for t in enriched["terms"]),
               key=lambda pair: pair[1]))
    i += 1
    # print() with a single argument behaves identically on Py2 and Py3;
    # the unused prep_word helper from the original was removed.
    print(i)
    return enriched
コード例 #3
0
def add_info(lyric, text_col, i=0):
    """Return a copy of *lyric* augmented with "lyrics" (tokenized Text),
    "terms" (content-word set) and "tf_idf" (term -> score).

    Parameters:
        lyric: dict carrying a "lyrics" list of strings.
        text_col: object exposing ``tf_idf(term, text)``.
        i: progress counter; printed after incrementing.

    Returns a new dict; *lyric* itself is left untouched.
    """
    result = dict(lyric)
    # Tokenize the utf-8 encoded, space-joined lyric lines, lowercased.
    result["lyrics"] = Text(
        tok.lower()
        for tok in nltk.word_tokenize(" ".join(result["lyrics"]).encode("utf-8")))
    # Keep content words only: no stopwords, must pass valid().
    result["terms"] = set(
        tok for tok in result["lyrics"] if not is_stopword(tok) and valid(tok))
    # Pair every term with its tf-idf against the collection, ascending.
    result["tf_idf"] = dict(
        sorted(((term, text_col.tf_idf(term, result["lyrics"]))
                for term in result["terms"]),
               key=lambda pair: pair[1]))
    i += 1
    # Single-argument print() runs on both Python 2 and 3; the unused
    # prep_word helper was removed.
    print(i)
    return result
コード例 #4
0
ファイル: tfidf.py プロジェクト: Big-Data/nlp
def add_info(lyric, text_col, i=0):
    """Enrich a lyric record with tokens, term set, and tf-idf scores.

    Parameters:
        lyric: dict with a "lyrics" list of strings.
        text_col: collection providing ``tf_idf(term, text)``.
        i: progress counter echoed to stdout after incrementing.

    Returns:
        A fresh dict copy of *lyric* where "lyrics" is a tokenized,
        lowercased Text, "terms" holds the content words, and "tf_idf"
        maps each term to its score (built from ascending-sorted pairs).
    """
    record = dict(lyric)
    # Join the lyric lines, encode to utf-8, tokenize, and lowercase.
    record["lyrics"] = Text(
        w.lower()
        for w in nltk.word_tokenize(" ".join(record["lyrics"]).encode("utf-8")))
    # Content words: everything that is neither a stopword nor invalid.
    record["terms"] = set(
        w for w in record["lyrics"] if not is_stopword(w) and valid(w))
    # tf-idf per term, sorted by score before building the dict.
    record["tf_idf"] = dict(
        sorted(((t, text_col.tf_idf(t, record["lyrics"])) for t in record["terms"]),
               key=lambda pair: pair[1]))
    i += 1
    # print() form is Py2/Py3 compatible; the dead prep_word helper is gone.
    print(i)
    return record
    
# Build one NLTK Text per identified song (content words only, lowercased)
# and wrap them in a TextCollection so tf-idf can be computed per song.
ts = TextCollection([Text((w.lower()
                           for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8'))
                           if not is_stopword(w) and valid(w)),
                          name=l["id"])
                     for l in lyrics if l.get("id")])

# One-off enrichment pass that produced the pickle loaded below:
#lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id")))

#with open("withtfidf.pickle", "w") as f:
#    pickle.dump(lyrics, f)

# Pickle files are binary: "rb" is required on Python 3 and harmless on
# Python 2 (the original opened in text mode "r").
# NOTE(review): pickle.load executes code embedded in the file -- only
# load pickles this project produced itself.
with open("withtfidf.pickle", "rb") as f:
    lyrics = pickle.load(f)

# Re-index the list of lyric dicts by song id for O(1) lookup.
lyrics = dict((l["id"], l) for l in lyrics)

def similar_lyrics(lyric):
    for l in lyrics.values():
コード例 #5
0
ファイル: naives.py プロジェクト: Big-Data/nlp
def all_words(lyrics):
    """Generate the lowercased content words of every titled song.

    For each dict in *lyrics*, its "lyrics" lines are joined and
    tokenized; a token is yielded (lowercased) only when the song has a
    title, the token is not a stopword, and its unigram POS tag passes
    ``valid``.
    """
    for song in lyrics:
        for token in nltk.word_tokenize(" ".join(song["lyrics"])):
            # Guard clauses preserve the original short-circuit order:
            # title first, then stopword, then the POS-tag check.
            if not song.get("title"):
                continue
            if is_stopword(token):
                continue
            if valid(unigram_tagger.tag([token])[0][1]):
                yield token.lower()
コード例 #6
0
        w.lower()
        for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8")))
    a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w))

    a["tf_idf"] = dict(
        sorted(((t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"]),
               key=lambda x: x[1]))
    i += 1
    print i
    return a


# Assemble a TextCollection of per-song Texts (lowercased content words
# only) so tf-idf can later be computed for each song's terms.
ts = TextCollection([
    Text((w.lower()
          for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8'))
          if not is_stopword(w) and valid(w)),
         name=l["id"]) for l in lyrics if l.get("id")
])

# One-off enrichment pass that produced the pickle loaded below:
#lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id")))

#with open("withtfidf.pickle", "w") as f:
#    pickle.dump(lyrics, f)

# Pickle data is binary: open with "rb" (mandatory on Python 3, harmless
# on Python 2; the original used text mode "r").
# NOTE(review): pickle.load runs code from the file -- trusted input only.
with open("withtfidf.pickle", "rb") as f:
    lyrics = pickle.load(f)

# Re-key the lyric records by song id for constant-time lookup.
lyrics = dict((l["id"], l) for l in lyrics)


def similar_lyrics(lyric):