Beispiel #1
0
 def calculate(self):
     """Reduce every cluster in ``self.__data`` to the first field of its items.

     Clusters holding at most two items are used as they are.  For larger
     clusters a fresh ``vsm.VSM`` is built from the cluster, stored on the
     instance, and ``self.__calculate_detail`` supplies the items instead.

     Returns:
         A list with one inner list per cluster, each containing ``item[0]``
         for every item of that cluster's result.
     """
     output = []
     for group in self.__data:
         if len(group) > 2:
             # The VSM must be rebuilt before the detailed calculation runs.
             self.__vsm = vsm.VSM(group)
             ranked = self.__calculate_detail(group)
         else:
             # Too few elements to be worth computing: pass the cluster through.
             ranked = group
         output.append([entry[0] for entry in ranked])
     return output
Beispiel #2
0
 def set_property(self, category, appId):
     """Record the category and app id, then reload the data and the VSM.

     Args:
         category: value stored as the instance's category.
         appId: value stored as the instance's app id.
     """
     self.__category, self.__appId = category, appId
     # Data is fetched only after both identifiers are stored
     # (presumably __get_data reads them -- confirm against its definition).
     self.__data = self.__get_data()
     # Initialise a VSM object over the freshly loaded data.
     self.__vsm = vsm.VSM(self.__data)
    correct = 0
    precision = []
    print(lt_gen)
    for i in lt_gen:
        a += 1
        if i in lt_ground_truth:
            correct += 1
            print('c', correct, 'a', a)
            precision.append(correct / a)
    MAP = sum(precision) / len(lt_ground_truth)
    return MAP


# In[12]:

v = vsm.VSM(model_dir=corpus)

# ## Stage 1

# In[13]:

# Load the training set and index it by the 'Id' column so that the target
# row can be looked up directly by its id.
df_whole = pd.read_csv('create_corpus/task2_trainset.csv').set_index('Id')
target = df_whole.loc[target_id]

# Replace every character that is neither a word character nor whitespace
# with a space, lowercase the abstract, and wrap it in a one-element list.
target_whole_query = [re.sub(r'[^\w\s]', ' ', target['Abstract']).lower()]

# In[14]:
Beispiel #4
0
    help=
    'target document id to generate query to calculate mAP, ranges from 0 to 4'
)
parser.add_argument(
    '--model_dir',
    default='./model/',
    type=str,
    help='directory of inverted file',
)
args = parser.parse_args()

# Echo the parsed run configuration before constructing the VSM.
print('---------------')
print('dim=', args.dim)
print('qlen=', args.qlen)
print('target doc id', args.tid)
print('---------------')

v = vsm.VSM(model_dir=args.model_dir, dim=args.dim)

# with open('test.json', 'w') as f:
#     json.dump(v.TF, f)
# with open('vocab.json', 'w') as f:
#     json.dump(v.vocab, f)
# print(v.vocab.shape)
# print(v.vocab[1000])
# print(v.filelist[1349])
import numpy as np


def calc_AP(ret, ans):
    AP = 0
    hit = 0
    for i, v in enumerate(ret):
Beispiel #5
0
import vsm
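"""
queries
    - list
    - may contain several queries
    - words are separated by spaces; punctuation is handled for you
scores_follow_filelist
    - the score of every document, ordered like the current file list
descending_ranking_by_id
    - the ranking; each number is the position of a file in the current file list
    - descending order: the first entry has the highest score (rank 1)
"""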

# Build a VSM from the 'mini_corpus_1' model directory.
v = vsm.VSM(model_dir='mini_corpus_1')

# Queries are passed as a list of strings; replace these placeholders with
# real query text.
queries = ["1st query", "2nd query"]
# for example:
# queries = ["Adversarial Examples that Fool Detectors"]

# retrieval() is unpacked into two results: the scores and the ranking.
scores_follow_filelist, descending_ranking_by_id = v.retrieval(queries)
Beispiel #6
0
        return map_scores, rs

if __name__ == '__main__':
    args = _parse_args()

    if not args.query_paths:
        if not os.path.exists(args.results_path):
            print("Neither query nor folder found!")
            exit(1)
        args.query_paths = glob.glob(args.results_path + "*.txt")

    print(args.query_paths)

    for doc_p in args.doc_path:
        print("Init VSM...")
        v = vsm.VSM(model_dir=doc_p)

        print("Calculate ranking...")
        query_lst = []
        query_docid_lst = []
        query_name_lst = []
        queries = []
        for filename in args.query_paths:
            with open(filename, 'r') as f:
                this_query_lst = f.readlines()
                for q in this_query_lst:
                    q = q.strip().split(' ') # [docID, query_text]
                    docID, query = int(q[0]), ' '.join(q[1:])
                    queries.append(query)
                    #print(docID, query)
                    if docID == -1 or args.topk: