import glob
import json
import math
import os
import re


def read_file(document, extend):
    """Build a per-tweet term-frequency table from JSON-lines tweet files.

    `document` is the directory to scan and `extend` is a glob pattern
    (e.g. "*.json"). Returns (tf_by_docid, atid); atid is returned but
    never populated here (a commented-out branch once recorded the ids
    of tweets containing '#').
    """
    tf_by_docid = {}
    atid = {}
    # Glob inside `document` without mutating the process working directory.
    for path in glob.glob(os.path.join(document, extend)):
        file = os.path.basename(path)
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line_object = json.loads(line)
                tweet_text = line_object['text']
                # Strip @mentions, URLs, and all other non-alphanumeric
                # characters from the tweet text.
                text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)",
                              " ", tweet_text)
                # Prefix the tweet id with the file name so ids stay unique
                # across files.
                tweet_id = file + line_object['id_str']
                parsedTokenList = parseTokensFromText(text)
                # (A commented-out variant POS-tagged the tokens with
                # nltk.pos_tag to keep only nouns.)
                # Count raw term frequencies for this tweet.
                tf_by_docid[tweet_id] = tf = {}
                for term in parsedTokenList:
                    tf[term] = tf.get(term, 0) + 1
    return tf_by_docid, atid
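
# Both functions in this module assume a `parseTokensFromText` helper
# defined elsewhere in the project. A minimal stand-in, assuming plain
# lowercased whitespace tokenization (the real tokenizer may stem, drop
# stopwords, or differ in other ways), keeps this module runnable on its
# own; delete it if the real implementation is imported.
def parseTokensFromText(text):
    # Lowercase and split on whitespace; split() collapses runs of
    # whitespace, so no empty tokens are produced.
    return [token.lower() for token in text.split()]
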
def calculate_query(dic):
    """Compute log-scaled term frequencies for each query.

    `dic` maps a query number to its text. Returns
    {num: {term: log(count + 1)}}.
    """
    query_by_number = {}
    for num in sorted(dic.keys()):
        query = parseTokensFromText(dic[num])
        # (Query expansion via WordNet was a commented-out option here.)
        query_by_number[num] = qf = {}
        # Raw term counts first ...
        for term in query:
            qf[term] = qf.get(term, 0) + 1
        # ... then log-damp each count once per distinct term. Iterating
        # over qf instead of the token list avoids applying the log more
        # than once to terms that repeat within a query.
        for term in qf:
            qf[term] = math.log(qf[term] + 1)
    return query_by_number
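
# A hedged usage sketch: the directory name, glob pattern, and query texts
# below are placeholders for illustration, not values from this project.
if __name__ == "__main__":
    tf_by_docid, atid = read_file("tweets", "*.json")
    queries = {1: "earthquake relief effort", 2: "flight delays"}
    query_by_number = calculate_query(queries)
    print(len(tf_by_docid), "tweets indexed")
    print(query_by_number)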