def read_file(document, extend):
    """Build per-tweet term-frequency tables from JSON tweet files.

    Changes the working directory to ``document`` (side effect), then reads
    every file matching the glob pattern ``extend``.  Each line of each file
    is parsed as a JSON tweet object; mentions, URLs and non-alphanumeric
    characters are stripped from its ``text`` before tokenization.

    Parameters:
        document: directory containing the tweet files (becomes the CWD).
        extend:   glob pattern selecting the files to read (e.g. ``"*.json"``).

    Returns:
        (tf_by_docid, atid) where tf_by_docid maps ``filename + id_str`` to a
        dict of raw term counts for that tweet, and atid is always an empty
        dict (its population is disabled in this version).
    """
    tf_by_docid = {}
    atid = {}
    os.chdir(document)
    for file in glob.glob(extend):
        # 'with' guarantees the handle is closed even if a line fails to
        # parse (original leaked one open file per matched filename).
        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line_object = json.loads(line)
                tweet_text = line_object['text']
                # Strip @mentions, URLs, and anything that is not
                # alphanumeric/space/tab.  Raw string so \w, \S, \t are
                # passed to the regex engine unmangled.
                text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                              " ", tweet_text)
                # Document id = source filename + tweet id (unique per tweet).
                tweet_id = file + line_object['id_str']
                parsedTokenList = parseTokensFromText(text)
                # Fresh raw term-frequency dict for this tweet.
                tf_by_docid[tweet_id] = tf = {}
                for term in parsedTokenList:
                    tf[term] = tf.get(term, 0) + 1
    return tf_by_docid, atid
def read_file(document, extend):
    """Build per-tweet term-frequency tables from JSON tweet files.

    NOTE(review): this definition is a byte-for-byte duplicate of the
    ``read_file`` immediately above it in this file; being defined second,
    it is the one that wins at import time.  Consider deleting one copy.

    Parameters:
        document: directory containing the tweet files (becomes the CWD —
            ``os.chdir`` is a process-wide side effect).
        extend:   glob pattern selecting the files to read.

    Returns:
        (tf_by_docid, atid): mapping of ``filename + id_str`` -> term-count
        dict, plus an always-empty ``atid`` dict (its population is
        commented out in the original).
    """
    tf_by_docid = {}
    atid = {}
    os.chdir(document)
    for file in glob.glob(extend):
        # Context manager closes the handle; the original opened each file
        # and never closed it.
        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line_object = json.loads(line)
                tweet_text = line_object['text']
                # Remove @mentions, URLs, and non-alphanumeric characters
                # (spaces/tabs kept).  Raw string for the regex escapes.
                text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                              " ", tweet_text)
                tweet_id = file + line_object['id_str']
                parsedTokenList = parseTokensFromText(text)
                tf_by_docid[tweet_id] = tf = {}
                for term in parsedTokenList:
                    tf[term] = tf.get(term, 0) + 1
    return tf_by_docid, atid
def calculate_query(dic):
    """Compute log-damped term frequencies for each query.

    Parameters:
        dic: mapping of query number -> query text string.

    Returns:
        dict mapping each query number to a dict of
        ``term -> log(raw_count + 1)``.
    """
    query_by_number = {}
    for num in sorted(dic.keys()):
        query = parseTokensFromText(dic[num])
        query_by_number[num] = qf = {}
        # Raw counts first.
        for term in query:
            qf[term] = qf.get(term, 0) + 1
        # Damp each DISTINCT term exactly once.  The original iterated the
        # token list here, so a term occurring k times in the query had
        # log() applied k times (log(log(c+1)+1)...), which is a bug.
        for term in qf:
            qf[term] = math.log(qf[term] + 1)
    return query_by_number
def calculate_query(dic):
    """Compute log-damped term frequencies for each query.

    NOTE(review): duplicate of the ``calculate_query`` defined just above
    (differs only in spacing); this second definition shadows the first.
    Consider removing one copy.

    Parameters:
        dic: mapping of query number -> query text string.

    Returns:
        dict mapping each query number to ``{term: log(raw_count + 1)}``.
    """
    query_by_number = {}
    for num in sorted(dic.keys()):
        query = parseTokensFromText(dic[num])
        query_by_number[num] = qf = {}
        for term in query:
            qf[term] = qf.get(term, 0) + 1
        # Iterate distinct terms (dict keys), not the token list: the
        # original applied math.log once per OCCURRENCE, double-damping
        # any term repeated within a query.
        for term in qf:
            qf[term] = math.log(qf[term] + 1)
    return query_by_number