Example #1
0
def sparse_sentence(s):
    """Tokenize sentence *s* with MeCab and return a set of content words.

    Symbols, auxiliary verbs and particles are filtered out.

    :param s: unicode sentence to analyze
    :returns: set of unique surface forms (unicode)

    NOTE(review): "sparse" is presumably a typo for "parse" in the mecab
    wrapper's API; the name is kept for caller compatibility.
    """
    # [:-2] drops the trailing "EOS" marker and the empty string produced
    # by the final newline of MeCab's output.
    s_sparse = \
        mecab.sparse_all(s.encode("utf-8"),
                         "/usr/lib/libmecab.so.1").split("\n")[:-2]
    candidate = set()
    for line in s_sparse:
        # Each line should look like "surface\tPOS,subPOS,...".
        fields = line.decode("utf-8").split("\t")
        if len(fields) < 2:
            # Malformed line (no tab) -- skip instead of raising IndexError.
            continue
        pos = fields[1].split(",")[0]
        # Drop symbols, auxiliary verbs and particles.
        # (Once enough data accumulates, restrict to nouns only.)
        if pos not in (u"記号", u"助動詞", u"助詞"):
            candidate.add(fields[0])
    return candidate
Example #2
0
def sparse_sentence(s):
    """Run *s* through MeCab and return the set of candidate words.

    Filters out symbols, auxiliary verbs and particles.

    :param s: unicode sentence to analyze
    :returns: set of unique surface forms (unicode)

    NOTE(review): "sparse" looks like a typo for "parse" in the mecab
    wrapper's API; kept unchanged for caller compatibility.
    """
    # The [:-2] slice removes the "EOS" line plus the trailing empty
    # string left by MeCab's final newline.
    s_sparse = \
        mecab.sparse_all(s.encode("utf-8"),
                         "/usr/lib/libmecab.so.1").split("\n")[:-2]
    candidate = set()
    for line in s_sparse:
        # Expected format per line: "surface\tPOS,subPOS,...".
        fields = line.decode("utf-8").split("\t")
        if len(fields) < 2:
            # Guard against tab-less lines that would otherwise raise
            # IndexError on fields[1].
            continue
        pos = fields[1].split(",")[0]
        # Skip symbols, auxiliary verbs and particles.
        # (Once enough data accumulates, restrict to nouns only.)
        if pos not in (u"記号", u"助動詞", u"助詞"):
            candidate.add(fields[0])
    return candidate
Example #3
0
def analyze():
    """Morphologically analyze pending tweets and update the Markov tables.

    Pulls tweets flagged isAnalyze == 1 from the DB in batches of up to
    1000, runs each through MeCab, feeds the resulting word lists into
    the Markov data, and marks every processed tweet isAnalyze = 2.
    Accumulated counts are committed per full batch, with a final commit
    for any remainder.
    """
    # Load data from the DB.
    userdata = getAuthData(conf_path)
    dbSession = model.startSession(userdata)
    # Fetch the data accumulated between the previous update and now.
    q = dbSession.query(model.Tweet)

    # TODO: change this to fetch 1000 rows at a time and write to the DB
    # once a certain number of entries has accumulated.

    insertData = defaultdict(int)
    while True:
        # NOTE(review): loop termination relies on the isAnalyze = 2
        # updates being autoflushed before each re-query -- confirm the
        # session has autoflush enabled, otherwise this never exits.
        tq = q.filter(model.Tweet.isAnalyze == 1)[:1000]
        i = 0
        if len(tq) == 0:
            break
        for t in tq:
            # Per tweet:
            t.text = removeCharacter(t.text)
            t.isAnalyze = 2  # mark as processed
            t_enc = t.text.encode(g_mecabencode, "ignore")
            sarray = mecab.sparse_all(t_enc, mecabPath).split("\n")
            sarray2 = connectUnderScore(sarray)
            markovWordList, topNWordList = takeWordList(sarray2)
            print len(markovWordList)

            # Accumulate recently seen nouns.
            dbSession.add(t)
            appendMarkov(markovWordList, dbSession, insertData)
            # appendCollocation(markovWordList,dbSession)
            i += 1
            # Flush once a full batch of 1000 tweets has been processed.
            if i >= 1000:
                insertMarkovData2DB(dbSession, insertData)
                dbSession.commit()
                insertData = defaultdict(int)
                i = 0

    # Write out any remaining partial batch.
    if len(insertData) > 0:
        insertMarkovData2DB(dbSession, insertData)
        dbSession.commit()
Example #4
0
def analyze():
    """Morphologically analyze pending tweets and update the Markov tables.

    Pulls tweets flagged isAnalyze == 1 from the DB in batches of up to
    1000, runs each through MeCab, feeds the resulting word lists into
    the Markov data, and marks every processed tweet isAnalyze = 2.
    Accumulated counts are committed per full batch, with a final commit
    for any remainder.
    """
    # Load data from the DB.
    userdata = getAuthData(conf_path)
    dbSession = model.startSession(userdata)
    # Fetch the data accumulated between the previous update and now.
    q = dbSession.query(model.Tweet)

    # TODO: change this to fetch 1000 rows at a time and write to the DB
    # once a certain number of entries has accumulated.

    insertData = defaultdict(int)
    while (True):
        # NOTE(review): loop termination relies on the isAnalyze = 2
        # updates being autoflushed before each re-query -- confirm the
        # session has autoflush enabled, otherwise this never exits.
        tq = q.filter(model.Tweet.isAnalyze == 1)[:1000]
        i = 0
        if len(tq) == 0: break
        for t in tq:
            # Per tweet:
            t.text = removeCharacter(t.text)
            t.isAnalyze = 2  # mark as processed
            t_enc = t.text.encode(g_mecabencode, 'ignore')
            sarray = mecab.sparse_all(t_enc, mecabPath).split("\n")
            sarray2 = connectUnderScore(sarray)
            markovWordList, topNWordList = takeWordList(sarray2)
            print len(markovWordList)

            # Accumulate recently seen nouns.
            dbSession.add(t)
            appendMarkov(markovWordList, dbSession, insertData)
            #appendCollocation(markovWordList,dbSession)
            i += 1
            # Flush once a full batch of 1000 tweets has been processed.
            if i >= 1000:
                insertMarkovData2DB(dbSession, insertData)
                dbSession.commit()
                insertData = defaultdict(int)
                i = 0

    # Write out any remaining partial batch.
    if len(insertData) > 0:
        insertMarkovData2DB(dbSession, insertData)
        dbSession.commit()