コード例 #1
0
ファイル: idfResult.py プロジェクト: siaoshan33/IR
def createIDFFile():
    """Create ``./initialResult/idfResult.txt`` unless it already exists.

    For every entry of ``dictionary.getDictionary()`` the number of documents
    returned by ``getFileList.getFilesListFromFile()`` that contain the entry
    is written on its own line, in dictionary order.  Note that the value
    written is the raw document count, not an inverse frequency.

    An existing file is never touched.  Returns ``None``.
    """
    fname = "./initialResult/idfResult.txt"
    if os.path.isfile(fname):
        return

    allDictionary = dictionary.getDictionary()
    # Kept from the original for its side effect on jieba's global state;
    # no segmentation is performed inside this function.
    jieba.load_userdict("./data/dictionary_utf8.txt")
    fileList = getFileList.getFilesListFromFile()

    # All counts are computed before the file is created, so a failure
    # half-way through cannot leave a truncated file that later calls would
    # mistake for a finished result.
    lines = []
    for inde, sub in enumerate(allDictionary, 1):
        # Plain substring test: dictionary entries are words, not regular
        # expressions, so characters such as "." or "(" must match literally.
        docCount = sum(1 for fv in fileList if sub in fv)
        lines.append(str(docCount) + "\n")
        print(inde)  # progress indicator

    # The context manager guarantees the handle is closed.
    with open(fname, 'w') as f:
        f.writelines(lines)
コード例 #2
0
ファイル: idfResult.py プロジェクト: siaoshan33/IR
def createIDFFile():
    """Create ``./initialResult/idfResult.txt`` unless it already exists.

    One line is written per entry of ``dictionary.getDictionary()``, holding
    ``allDocumentNumber / df`` where ``df`` is the number of documents from
    ``getFileList.getIntsFromFile()`` that contain the entry.  An entry found
    in no document gets ``0.0`` instead of raising ``ZeroDivisionError``.

    An existing file is never touched.  Returns ``None``.
    """
    allDocumentNumber = 2265.0
    fname = "./initialResult/idfResult.txt"
    if os.path.isfile(fname):
        return

    fileList = getFileList.getIntsFromFile()
    allDictionary = dictionary.getDictionary()

    # Compute every value before creating the file so that an error cannot
    # leave a partial file behind that later calls would treat as complete.
    lines = []
    for sub in allDictionary:
        docFreq = sum(1 for fv in fileList if sub in fv)
        # Guard the division: a word that occurs in no document has df == 0.
        idfNum = allDocumentNumber / docFreq if docFreq else 0.0
        lines.append(str(idfNum) + "\n")

    # The context manager guarantees the handle is closed.
    with open(fname, 'w') as f:
        f.writelines(lines)
コード例 #3
0
def createDocumentTF():
    """Create ``./initialResult/documentTFResult.txt`` unless it exists.

    Each output line starts with a dictionary entry followed by two spaces
    and then, separated by single spaces, the number of times that entry
    occurs as a whitespace-delimited token in each document returned by
    ``getFileList.getFilesListFromFile()``.

    An existing file is never touched.  Returns ``None``.
    """
    from collections import Counter

    allDictionary = dictionary.getDictionary()
    fileList = getFileList.getFilesListFromFile()

    fname = './initialResult/documentTFResult.txt'
    if os.path.isfile(fname):
        return

    # Tokenise and count every document once instead of once per dictionary
    # entry; a Counter returns 0 for tokens it has not seen.
    docCounts = [Counter(fv.split()) for fv in fileList]

    # The context manager closes the file even if writing fails.
    with open(fname, 'w') as f:
        for sub in allDictionary:
            counts = "".join(" " + str(c[sub]) for c in docCounts)
            f.write(sub + " " + counts + "\n")
コード例 #4
0
ファイル: queryTF.py プロジェクト: 0lidaxiang/IR
def createQueryTFFile():
    """Create ``./initialResult/queryTF.txt`` unless it already exists.

    Each output line holds a dictionary entry followed by
    ``query.count(entry)`` for every query returned by
    ``getQuerysList.getIntsFromFile()``, all separated by single spaces.

    An existing file is never touched.  Returns ``None``.
    """
    allDictionary = dictionary.getDictionary()
    fname = './initialResult/queryTF.txt'
    if os.path.isfile(fname):
        return

    tempQueryList = getQuerysList.getIntsFromFile()
    # The context manager closes the file even if counting or writing fails.
    with open(fname, 'w') as f:
        for sub in allDictionary:
            fields = [str(sub)]
            fields.extend(str(v.count(sub)) for v in tempQueryList)
            f.write(" ".join(fields) + "\n")
コード例 #5
0
ファイル: queryTF.py プロジェクト: siaoshan33/IR
def createQueryTFFile():
    """Create ``./initialResult/queryTF.txt`` unless it already exists.

    Each output line holds a dictionary entry followed by the number of
    times it occurs as a whitespace-delimited token in the ``"content"``
    field of every query returned by ``queryList.getQueryFilesList()``,
    all separated by single spaces.

    An existing file is never touched.  Returns ``None``.
    """
    from collections import Counter

    allDictionary = dictionary.getDictionary()
    fname = './initialResult/queryTF.txt'
    if os.path.isfile(fname):
        return

    tempQueryList = queryList.getQueryFilesList()
    # Tokenise each query once rather than once per dictionary entry;
    # a Counter returns 0 for tokens it has not seen.
    queryCounts = [Counter(v["content"].split()) for v in tempQueryList]

    # The context manager closes the file even if writing fails.
    with open(fname, 'w') as f:
        for sub in allDictionary:
            counts = "".join(" " + str(c[sub]) for c in queryCounts)
            f.write(sub + counts + "\n")
コード例 #6
0
def createDocumentTF():
    """Create ``./initialResult/documentTFResult.txt`` unless it exists.

    One line is written per dictionary entry.  The line consists of
    ``doc.count(entry)`` for every document returned by
    ``getFileList.getIntsFromFile()``, each value preceded by a single
    space (so every line starts with a space).

    An existing file is never touched.  Returns ``None``.
    """
    allDictionary = dictionary.getDictionary()
    fileList = getFileList.getIntsFromFile()

    fname = './initialResult/documentTFResult.txt'
    if os.path.isfile(fname):
        return

    # The context manager closes the file even if counting or writing fails.
    with open(fname, 'w') as f:
        for sub in allDictionary:
            counts = "".join(" " + str(fv.count(sub)) for fv in fileList)
            f.write(counts + "\n")
コード例 #7
0
def getDict_DocCountWord():
    """Load, or build, ``./initialResult/dict_DocCountWord.txt``.

    If the file exists it is parsed and returned as a list with one element
    per line; each element is a list of ``[doc_index, count]`` integer pairs
    taken from the line's whitespace-separated ``doc_index:count`` items.
    Empty lines yield an empty list.

    If the file does not exist it is built from
    ``./initialResult/wordNumber.txt``: for every dictionary entry, one line
    is written listing ``doc_index:count`` (each followed by a space) for
    each ``word:count`` item in ``wordNumber.txt`` whose word equals the
    entry, where ``doc_index`` is the zero-based line number in
    ``wordNumber.txt``.  In this case an empty list is returned; call the
    function again to obtain the parsed data.
    """
    fname = "./initialResult/dict_DocCountWord.txt"
    result = []

    if os.path.isfile(fname):
        with open(fname) as f:
            lines = f.read().splitlines()
        for line in lines:
            # split() without an argument drops the trailing space (and the
            # empty lines) that the writer below produces; split(" ") would
            # yield '' items and make int() raise ValueError.
            result.append(
                [list(map(int, item.split(":"))) for item in line.split()])
        return result

    with open('./initialResult/wordNumber.txt') as f:
        lines = f.read().splitlines()

    # The context manager closes the output file even if building fails.
    with open(fname, 'w') as f:
        createWordDocCount()
        allDictionary = dictionary.getDictionary()

        # Parse every document line once instead of once per dictionary
        # entry.  The last space-separated field is discarded, as before.
        docEntries = []
        for line in lines:
            fields = line.split(" ")
            del fields[-1]
            docEntries.append([value.split(":") for value in fields])

        for word in allDictionary:
            strWrite = ""
            for doc_index, entries in enumerate(docEntries):
                for results in entries:
                    if results[0] == word:
                        strWrite = strWrite + str(
                            doc_index) + ":" + results[1] + " "
            f.write(strWrite + "\r\n")
    return result
コード例 #8
0
from keras.models import Model
from keras.layers import Input, Dense
from datetime import datetime
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from matplotlib import pyplot

import dictionary
import getFileList
import getDocL

# Module-level configuration and data, evaluated once at import time.
k = 1  # context window size
wordsNum = 13290  # NOTE(review): presumably the vocabulary size - confirm

# Both loaders come from project modules; getAllWordsNum() below iterates
# docsL and takes len() of each element.
docsL = getFileList.getIntsFromFile()
dic = dictionary.getDictionary()


def getAllWordsNum():
    """Return the combined length of all entries in the module-level ``docsL``."""
    return sum(len(doc) for doc in docsL)


# Total length of all documents; the trailing number is the author's note of
# the value observed for their corpus.
all_words_num = getAllWordsNum()  # 393021
# NOTE(review): presumably the number of positions that have k neighbours on
# both sides - confirm the "- 1".
input_size = all_words_num - 2 * k - 1

# Initial input: two zero-filled column vectors with input_size rows each.
left_Input = np.zeros(shape=(input_size, 1))
right_Input = np.zeros(shape=(input_size, 1))
コード例 #9
0
ファイル: computeAqueryForADoc.py プロジェクト: siaoshan33/IR
# coding: utf-8
import os
import math
import numpy as np

import queryList
import dictionary
import documentTF
import queryTF
import idfResult
import getFileList

# Module-level data loaded once at import time and read by computeAquery().
allDictionary = dictionary.getDictionary()
wordNumber = len(allDictionary)  # number of dictionary entries
# Indexed per dictionary entry; computeAquery() calls .split() on each
# element, so the elements are whitespace-separated strings.
idfList = idfResult.getIDF()
numberOfDoc = 2265  # NOTE(review): presumably the corpus size - confirm

docTF = documentTF.getDocumentTF()
# Indexed per dictionary entry like idfList; each element is split on
# whitespace and indexed by query.
querTF = queryTF.getQueryTF()
fileNameList = getFileList.getFileNameList()


def computeAquery(queryIndex):
    """Collect the per-word IDF and query-TF values for one query.

    For every dictionary position ``i`` below ``wordNumber`` this appends
    the second whitespace-separated field of ``idfList[i]`` to ``idfL`` and
    field ``queryIndex`` of ``querTF[i]`` to ``qtfL``, both as floats.

    Args:
        queryIndex: index of the field to read from each ``querTF`` line.
    """
    lineNum1 = 0
    qtfL = []
    idfL = []
    while lineNum1 < wordNumber:
        # Convert only the field that is needed.  Subscripting the result of
        # map() fails on Python 3, where map() returns an iterator.
        idfL.append(float(idfList[lineNum1].split()[1]))
        qtfL.append(float(querTF[lineNum1].split()[queryIndex]))

        lineNum1 += 1
コード例 #10
0
ファイル: base.py プロジェクト: siaoshan33/IR
#!/usr/bin/python3
# coding: utf-8

import numpy as np
import math
from datetime import datetime

import dictionary
import document
import wordDocsCount
import wordDocsNoCount

# Timestamp taken before the module-level initialisation below.
startInitial = datetime.now()

docList = document.getFilesName()
wordList = dictionary.getDictionary()
wNumber = len(wordList)
dNumber = len(docList)  # 2265
topicNum = 10
# Random starting values drawn from Dirichlet distributions with all-ones
# concentration parameters, so every drawn vector sums to 1.
# P_d: plain list of dNumber values.
P_d = np.random.dirichlet(np.ones(dNumber), size=1).tolist()[0]
# P_w_T: array of shape (wNumber, topicNum); each row sums to 1.
P_w_T = np.random.dirichlet(np.ones(topicNum), size=wNumber)
# P_T_d: array of shape (topicNum, dNumber); each row sums to 1.
P_T_d = np.random.dirichlet(np.ones(dNumber), size=topicNum)
# P_T_wd: zero-filled array of shape (topicNum * dNumber, wNumber).
P_T_wd = np.zeros(shape=(topicNum * dNumber, wNumber))
# Count tables loaded from project modules; their layout is not visible here.
count_w_d = wordDocsCount.getWordDocCount()
noCount_w_d = wordDocsNoCount.getWordDocNoCount()
dict_DocCountWord = wordDocsCount.getDict_DocCountWord()


def get_BG():
    fname = "./BGLM.txt"
    with open(fname) as f:
コード例 #11
0
# TWITTER API BUILDING
# Credentials are read from the project's `keys` helper; the handler is given
# the first pair at construction and the access-token pair afterwards.
auth = OAuthHandler(keys.getCK(), keys.getCS())
auth.set_access_token(keys.getAT(), keys.getAS())
api = tweepy.API(auth)

# INITIALSTATE API BUILDING
# `streamer` is the object that on_data() logs matched emoji names to.
streamer = Streamer(bucket_name=keys.getBN(),
                    bucket_key=keys.getBK(),
                    access_key=keys.getAK())

# EMOJIS
# Parallel lists: emNames[i] is the 'id' of the entry whose 'code' is
# emCodes[i].
emCodes = []
emNames = []
e = 'emojis'  # first argument passed to streamer.log() for every match

emDict = getDictionary()
# Only the values are used.  dict.values() exists on both Python 2 and
# Python 3, whereas dict.iteritems() was removed in Python 3.
for value in emDict.values():
    emCodes.append(value['code'])
    emNames.append(value['id'])


class MyListener(StreamListener):
    def on_data(self, data):
        d = json.loads(data)
        if 'text' in d:
            txt = d['text']

            for emoj in emCodes:
                if emoj in txt:
                    streamer.log(e, emNames[emCodes.index(emoj)])
            # HANDLES LOCATION (IF ANY)