def test_idf(self):
    # Exact idf of "dog" in the small test index: ln(3) = 1.0986... given
    # the collection/document counts baked into that fixture.
    self.assertEqual(self.index_.idf("dog"), 1.0986122886681098)
    # Re-point at the full AP88-89 Indri index and rebuild the wrapper.
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    # NOTE(review): this asserts document_count, not idf, despite the test
    # name — looks copied from test_document_count; confirm intent.
    self.assertEqual(self.index_.document_count("first"), 0)
def info(self, info='', col_bits=5, pagenum=100):
    """Crawl the web for pages related to *info*, index them, and return a
    location-encoded description of the best-matching page.

    :param info: input text to describe.
    :param col_bits: bit width used by the Location encoder.
    :param pagenum: how many result pages the link spider should crawl.
    :return: the list produced by Location.describe().
    """
    segmented = PreDeal.seg(info)
    # Step 1: extract the top-10 ranked keywords (place names, nouns, verbs).
    keys = jieba.analyse.textrank(info, topK=10, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    tag = '_'.join(keys)
    # Step 2: crawl — first the search-result links, then the page contents.
    SpiderLink(keys, self.root).crawl(pagenum)
    SpiderTo(tag + '.html').crawl()
    # Step 3: preprocess — dedupe, drop stopwords, segment; keep URLs and
    # per-page keyword sets.
    raw_dir = os.path.join(config.spidertext, tag)
    clean_dir = os.path.join(config.prepapath, tag)
    PreDeal().savetexts(filepath=raw_dir, prepath=clean_dir)
    # Step 4: build an index over the cleaned pages and retrieve the ones
    # containing the keyword information.
    index_dir = os.path.join(config.indexpath, tag)
    Index.build(datapath=clean_dir, indexpath=index_dir)
    Search(keys=keys, pindexp=index_dir).retrieve(keywords=segmented)
    # Step 5: pick the best page and encode its location description.
    locator = Location(keywords=segmented[:], col_bits=col_bits)
    return locator.describe(tag)
def test_term_count(self):
    """Collection-wide term frequencies on both the toy and AP88-89 indexes."""
    self.assertEqual(self.index_.term_count("dog"), 2)
    # Switch the wrapper over to the full AP88-89 Indri index.
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    self.assertEqual(self.index_.term_count("emotional"), 3515)
def test_document_count(self):
    """Document frequencies on both the toy and AP88-89 indexes."""
    self.assertEqual(self.index_.document_count("dog"), 1)
    # Switch the wrapper over to the full AP88-89 Indri index.
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    self.assertEqual(self.index_.document_count("emotional"), 2973)
class SimpleDocument:
    """Turn a text file into a cleaned word list backed by an Index.

    Pipeline: tokenize -> lowercase -> drop stopwords -> drop words unknown
    to the index -> collapse same-stem duplicates -> keep alphabetic words
    longer than 2 characters.
    """

    def __init__(self, parameters):
        self.index_ = Index(parameters)
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def replace_stemmed_similar_words_list(self, l):
        """Rewrite later occurrences of a word onto the first word that
        shares its stem (in place); returns the same list."""
        for i in range(len(l)):
            for j in range(i + 1, len(l)):
                if self.index_.check_if_have_same_stem(l[i], l[j]):
                    l[j] = l[i]
        return l

    def remove_non_existent_words_in_repo(self, l):
        """Keep only words the underlying index has in its vocabulary."""
        return [word for word in l if self.index_.check_if_exists_in_index(word)]

    def get_words(self, doc_file_name):
        """Read *doc_file_name* and return its filtered, stem-collapsed words."""
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        with open(doc_file_name, 'r') as handle:
            words = tokenizer.tokenize(handle.read())
        words = [w.lower() for w in words]
        words = [w for w in words if w not in self.stop_words]
        words = self.remove_non_existent_words_in_repo(words)
        words = self.replace_stemmed_similar_words_list(words)
        # Final shape filter runs after stem-collapsing, as in the contract.
        return [w for w in words if w.isalpha() and len(w) > 2]
class GensimCorpus(TextCorpus):
    """Gensim corpus over an Indri index.

    On first use it renders every indexed document as one cleaned,
    whitespace-joined line of a text file, then delegates iteration to
    the TextCorpus base class over that file.
    """

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        # Persistence is handled by the collection text file; nothing to do.
        pass

    def __init__(self, parameters):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.parameters = parameters
        self.index_ = Index(self.parameters)
        # Make sure the collection file exists before the parent reads it.
        self.store_collection_if_not_exists()
        super().__init__(self.parameters.params["lda"]["file_name"])

    def read_collection(self):
        """Render every document (ids 1..total_count) as one line of
        lowercased, stopword-free, index-stemmed terms."""
        rendered = []
        for doc_id in range(1, self.index_.total_count() + 1):
            raw = self.index_.obtain_text_of_a_document(doc_id)
            tokens = [t.lower() for t in self.tokenizer.tokenize(raw)
                      if t.isalpha() and len(t) > 2]
            tokens = [t for t in tokens if t not in self.stop_words]
            tokens = [self.index_.index.process_term(t) for t in tokens]
            rendered.append(' '.join(tokens) + '\n')
        return ''.join(rendered)

    def store_collection_if_not_exists(self):
        """Write the collection file unless it is already on disk."""
        target = self.parameters.params["lda"]["file_name"]
        if os.path.exists(target):
            return
        print("reading collection...", file=sys.stderr)
        content = self.read_collection()
        print("writing collection to:", target, file=sys.stderr)
        with open(target, "w") as f:
            f.write(content)
def test_index_with_xml_libraries(self):
    """Indexing with XML library specs must pull SwingLibrary keywords in."""
    xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
    db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
    scanner = Scanner(xml_libs)
    scanner.scan(self.suite_dir, 'robot', db_dir_with_xml)
    index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
    index.index_consturctor(self.resource_a_table_name)
    produced = os.listdir(self.index_dir)
    self.assertEqual(len(produced), 1)
    with open(os.path.join(self.index_dir, produced[0])) as f:
        data = json.load(f)
    records = data['keyword']
    self.assertTrue(any(rec[2] == 'SwingLibrary' for rec in records))
    self.assertTrue(any(rec[0] == 'Add Table Cell Selection' for rec in records))
    self.assertTrue(any(rec[0] == 'Select From Popup Menu' for rec in records))
def test_index_with_xml_libraries(self):
    """Indexing with XML library specs must pull SwingLibrary keywords into
    the generated index file.

    Fix: keyword records live under the index file's 'keyword' key — the
    other copies of this test and test_index_consturctor all read
    data['keyword']; reading 'keywords' here would raise KeyError on the
    file this indexer writes.
    """
    xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
    db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
    scanner = Scanner(xml_libs)
    scanner.scan(self.suite_dir, 'robot', db_dir_with_xml)
    index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
    index.index_consturctor(self.resource_a_table_name)
    files = os.listdir(self.index_dir)
    self.assertEqual(len(files), 1)
    with open(os.path.join(self.index_dir, files[0])) as f:
        data = json.load(f)
    self.assertTrue(any(kw[2] == 'SwingLibrary' for kw in data['keyword']))
    self.assertTrue(
        any(kw[0] == 'Add Table Cell Selection' for kw in data['keyword']))
    self.assertTrue(
        any(kw[0] == 'Select From Popup Menu' for kw in data['keyword']))
def info(self, fi='', pagenum=100):
    """Crawl, index, and retrieve web pages relevant to the text in file *fi*.

    :param fi: path of the input text file.
    :param pagenum: how many result pages the link spider should crawl.
    :return: (keywords, num) — the segmented keywords of the input text and
        the retrieval count reported by the searcher.
    """
    text = FileUtil.readfile(fi)
    segmented = PreDeal.seg(text)
    # Step 1: extract the top-10 ranked keywords (place names, nouns, verbs).
    keys = jieba.analyse.textrank(text, topK=10, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    tag = '_'.join(keys)
    # Step 2: crawl — first the search-result links, then the page contents.
    SpiderLink(keys, self.root).crawl(pagenum)
    SpiderTo(tag + '.html').crawl()
    # Step 3: preprocess — dedupe, drop stopwords, segment; keep URLs and
    # per-page keyword sets.
    raw_dir = os.path.join(config.spidertext, tag)
    clean_dir = os.path.join(config.prepapath, tag)
    PreDeal().savetexts(filepath=raw_dir, prepath=clean_dir)
    # Step 4: build the index and search it against the input file.
    index_dir = os.path.join(config.indexpath, tag)
    Index().build(datapath=clean_dir, indexpath=index_dir)
    searcher = Search1(filename=fi, pindexp=index_dir)
    # Search over a copy so the returned keyword list stays untouched.
    num = searcher.retrieve(keywords=segmented[:])
    return segmented, num
def __init__(self, parameters):
    # English stopword set used when filtering document tokens.
    self.stop_words = set(nltk.corpus.stopwords.words('english'))
    # \w+ tokenizer: splits text on any run of non-word characters.
    self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    self.parameters = parameters
    self.index_ = Index(self.parameters)
    # Materialize the collection text file before the parent class reads it.
    self.store_collection_if_not_exists()
    input_ = self.parameters.params["lda"]["file_name"]
    super().__init__(input_)
def test_check_if_exists_in_index(self):
    """Vocabulary membership on the AP88-89 index (stemming-sensitive)."""
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    present = self.index_.check_if_exists_in_index
    self.assertTrue(present("emotional"))
    self.assertFalse(present("first"))
    self.assertFalse(present("included"))
    self.assertTrue(present("includes"))
def query(self, keywords, kwpath=''): path = [] # 已经找到的文章列表 num = [] # 每篇含文章组合的个数 unmatch = 0 # 失配个数 maxh = 0 # 关键词个数 q = '' # 联合关键词 flag = True # 失配标志 hidekey = [] while keywords: kw = keywords[0] paper = Index.search(self.pindexp, q + ' ' + kw, limit=None) if paper: keywords.pop(0) hidekey.append(kw) q = q + ' ' + kw maxh += 1 else: # 当联合搜索无法进行下去时,转为寻找相似关键词 simikeys = WV.similarwords(kw) t_paper = [] if not simikeys: print( ".................Failed to find similar words................" ) flag = False else: for skw, similarity in simikeys: sq = q + ' ' + skw t_paper = Index.search(self.pindexp, sq, limit=None) if t_paper: hidekey.append(skw) keywords.pop(0) q = sq maxh += 1 break if not t_paper: # 有关键词但联合搜索仍失败 flag = False # 失配 if not flag: doc = Index.search(self.pindexp, q, limit=None) if not doc: print("The keyword '%s' is unMatch !" % kw) unmatch += 1 hidekey.append('0') keywords.pop(0) path.append(None) # flag = True else: path.append(doc) num.append(maxh) maxh = 0 q = '' flag = True if not keywords: path.append(paper) hide_string = ' '.join(hidekey) FileUtil.writefile(hide_string, kwpath) return path
def index_single(db_path, db_table, index_path, module_search_path, libs_in_xml):
    """Build the index for one db table.

    Extends sys.path with *module_search_path* so library modules resolve,
    creates *index_path* if missing, then runs the index constructor.
    """
    sys.path.extend(module_search_path)
    if not path.exists(index_path):
        makedirs(index_path)
    indexer = Index(db_path=db_path, index_path=index_path,
                    xml_libraries=libs_in_xml)
    indexer.index_consturctor(table=db_table)
def test_tf(self):
    """Term frequency of a word within a fixed document word list."""
    doc_words = SimpleDocument(self.parameters).get_words(
        "../configs/others/pride_and_prejudice_wiki.txt")
    self.assertEqual(self.index_.tf("dog", doc_words), 0.5)
    # tf is a property of the word list, so it must not change when the
    # wrapper is re-pointed at the AP88-89 index.
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    self.assertEqual(self.index_.tf("emotional", doc_words), 0.5)
def setUp(self):
    # Fresh index output directory for every test.
    self.index_dir = os.path.join(env.RESULTS_DIR, 'index_dir', )
    # Clear any leftover directory; retry with a short pause because rmtree
    # can race with the OS releasing file handles (notably on Windows).
    if os.path.exists(self.index_dir):
        while os.path.exists(self.index_dir):
            shutil.rmtree(self.index_dir)
            sleep(0.1)
    os.makedirs(self.index_dir)
    self.index = Index(self.db_dir, self.index_dir)
def test_tfidf(self):
    """tf-idf on AP88-89; a stopword stripped at indexing time must raise."""
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    doc_words = SimpleDocument(self.parameters).get_words(
        "../configs/others/pride_and_prejudice_wiki.txt")
    score = self.index_.tfidf('emotional', doc_words)
    print(score, file=sys.stderr)
    self.assertEqual(score, 2.0455597255490345)
    with self.assertRaises(Exception) as context:
        self.index_.tfidf('is', doc_words)
    expected_msg = 'unigram "is" not exist. Probably was a stopword in indexing.'
    self.assertTrue(expected_msg in str(context.exception))
class Original:
    """Yield the unigrams of a text worth indexing: alphabetic,
    non-stopword, and present in the underlying index vocabulary."""

    def __init__(self, parameters):
        self.parameters = parameters
        self.enchant_dict = enchant.Dict("en_US")
        self.stopwords = stopwords.words('english')
        self.index_ = Index(self.parameters)

    def check_if_unigram_should_be_added(self, unigram):
        """True when the lowercased unigram passes every filter."""
        candidate = unigram.lower()
        if not candidate.isalpha() or candidate in self.stopwords:
            return False
        if not self.index_.check_if_exists_in_index(candidate):
            return False
        # uncomment to also require the unigram to be a dictionary word
        # if not self.enchant_dict.check(candidate):
        #     print("WARNING: \"", candidate, "\" doesn't exist in dictionary.", file=sys.stderr, end=" ")
        #     return False
        return True

    def find_unigrams(self, text):
        """Generate (unigram, [(unigram, 1)]) for each acceptable token."""
        for token in word_tokenize(text):
            if self.check_if_unigram_should_be_added(token):
                yield token, [(token, 1)]
def test_obtain_term_ids_of_a_document(self):
    """Document 1 of AP88-89 must map to this exact (docno, term-id tuple).

    NOTE(review): golden data — id 0 presumably marks tokens dropped at
    indexing time (e.g. stopwords); regenerate if the index build changes.
    """
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    res = self.index_.obtain_term_ids_of_a_document(1)
    self.assertEqual(
        res,
        ('AP881107-0001',
         (147, 771, 0, 78064, 26, 2828, 1283, 92, 126, 147, 175009, 159395,
          771, 55, 0, 0, 2362, 26, 2828, 919, 0, 0, 115, 8, 461, 1624, 1826,
          0, 35, 693, 1198, 0, 195412, 0, 724, 430, 621, 340, 0, 771, 0,
          1502, 20649, 4327, 1620, 9, 247, 0, 0, 866, 0, 643, 0, 2828, 415,
          101374, 1289, 2015, 276, 1246, 0, 24, 29, 586, 0, 272, 0, 856, 0,
          101374, 1826, 2153, 0, 174, 693, 0, 195412, 0, 0, 158999, 1037, 0,
          117013, 137162, 123, 157, 0, 4415, 159395, 0, 0, 2262, 0, 0, 56,
          0, 101374, 251, 0, 0, 189, 101374, 0, 569, 0, 332, 3095, 1873, 0,
          2974, 0, 0, 13, 63630, 0, 485, 461, 91464, 0, 0, 91, 0, 50405, 0,
          0, 156, 159395, 0, 690, 0, 347, 0, 24, 4049, 0, 101374, 0, 0, 0,
          771, 0, 0, 92, 126, 690, 131907, 609, 0, 56, 0, 88, 2222, 0, 0,
          1624, 0, 160974, 0, 9, 0, 0, 436, 0, 2362, 273, 0, 774, 1620, 13,
          0, 263, 0, 887, 339, 176, 0, 0, 0, 91, 0, 50405, 0, 1620, 0, 0, 0,
          289, 0, 1202, 101374, 0, 254, 0, 4543, 8, 193, 91, 0, 979, 3597,
          0, 3095, 0, 791, 0, 2768, 937, 0, 0, 264, 91, 0, 0, 461, 2464, 0,
          9, 0, 0, 1333, 2198, 45622, 0, 4433, 0, 1624, 0, 678, 0, 0, 0, 0,
          3790, 0, 40, 0, 0, 118, 135313, 1620, 727, 136295, 0, 0, 0, 3408,
          0, 2362, 6, 0, 245, 0, 0, 494, 0, 415, 101374, 0, 2042, 435, 0, 0,
          0, 0, 0, 586, 0, 347, 1461, 0, 0, 116, 0, 0, 354, 0, 0, 9, 145130,
          0, 48310, 120515, 51, 0, 0, 1, 956, 540, 430, 32, 32, 0, 0, 0,
          221, 340, 0, 9, 771, 101374, 0, 1431, 897, 123, 0, 248, 0, 0, 91,
          0, 50405, 0, 919, 0, 114, 667, 0, 70387, 0, 62918, 0, 0, 101374,
          261, 0, 0, 311, 212, 0, 0, 61, 0, 4322, 767, 144897, 0, 62, 0, 0,
          91, 0, 50405, 0, 375, 8, 546, 0, 0, 26, 3527, 543, 3657, 3224, 0,
          0, 0, 2223, 0, 533, 0, 1019, 367, 0, 674, 0, 165, 101374, 0, 0,
          436, 0, 4322, 767, 144897, 0, 62, 0, 91, 0, 0, 445, 8, 0, 0, 550,
          0, 0, 26, 91, 3533, 1512, 285, 0, 0, 543, 0, 0, 0, 0, 0, 192, 0,
          0, 1798, 101374, 0, 328, 0, 26, 0, 2362, 45369, 1390, 0, 156, 167,
          0, 0, 91, 0, 50405, 0, 791, 0, 28, 0, 0, 264, 4415, 1977, 0, 187,
          15, 0, 0, 550, 1483, 273, 0, 247, 101374, 887, 0, 91, 0, 50405,
          168, 0, 303, 0, 515, 34, 77704, 0, 156, 23, 701, 273, 114, 1056,
          0, 0, 409, 490, 101374, 771, 897, 899, 0, 248, 0, 77704, 0, 0, 0,
          0, 1502, 599, 463, 147, 415, 4327, 2512, 0, 0, 70387, 0, 91, 0,
          50405, 221, 62, 0, 17, 41, 8, 1548, 0, 0, 3657, 3224, 550, 0, 0,
          171, 0, 2597, 4327, 62, 4, 0, 512, 0, 0, 1, 73, 532, 7, 0, 0, 0,
          192, 0, 3657, 3224, 0, 0, 0, 1548, 101374, 626, 918, 455, 0, 114,
          219, 0, 0, 245, 3095, 0, 0, 333, 0)))
class TestIndexing(unittest.TestCase):
    """Tests for the keyword Index builder.

    The content of db_dir is created by running the scanner over the
    TEST_DATA_DIR/suite_tree folder. If the scanner changes, db_dir must
    be recreated."""

    @classmethod
    def setUpClass(cls):
        # Scan the sample suite once; every test reads from this db_dir.
        cls.db_dir = os.path.join(env.RESULTS_DIR, 'db_dir')
        cls.suite_dir = os.path.join(env.TEST_DATA_DIR, 'suite_tree')
        scanner = Scanner()
        scanner.scan(cls.suite_dir, 'robot', cls.db_dir)
        cls.xml_libs = os.path.join(env.RESOURCES_DIR, 'library')

    def setUp(self):
        # Fresh index output directory for every test; retry the delete
        # because rmtree can race with the OS releasing file handles.
        self.index_dir = os.path.join(env.RESULTS_DIR, 'index_dir', )
        if os.path.exists(self.index_dir):
            while os.path.exists(self.index_dir):
                shutil.rmtree(self.index_dir)
                sleep(0.1)
        os.makedirs(self.index_dir)
        self.index = Index(self.db_dir, self.index_dir)

    def test_parse_table_data(self):
        t_name = os.path.join(env.RESOURCES_DIR,
                              'BuiltIn-ca8f2e8d70641ce17b9b304086c19657.json')
        self.index.queue.add(t_name, None, None)
        data, status = self.index.read_table(
            os.path.join(env.RESOURCES_DIR, t_name))
        var, kw_index = self.index.parse_table_data(data, t_name)
        self.assertTrue(u'${/}' in var)
        self.assertTrue('${OUTPUT_FILE}' in var)
        self.assertTrue('@{TEST_TAGS}' in var)

    def test_add_builtin(self):
        self.index.add_builtin_to_queue(self.db_dir)
        self.assertTrue(len(self.index.queue.queue) > 0)

    def test_read_table(self):
        data, read_status = self.index.read_table(
            os.path.join(self.db_dir, self.test_b_table_name))
        # NOTE(review): assertTrue is called with two args — the second is
        # treated as the failure message, so this never compares values;
        # assertEqual was probably intended.
        self.assertTrue(data['file_name'], 'test_b.robot')

    def test_get_keywords_resource(self):
        data = self.get_resource_b()
        expected_kw_list = ['Resource B Keyword 2', 'Resource B Keyword 1']
        expected_arg_list = [['kwb1'], []]
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        # NOTE(review): list.sort() returns None, so this compares
        # None == None and cannot fail; compare sorted() results instead.
        self.assertEqual(arg_list.sort(), expected_arg_list.sort())
        data = self.get_test_a()
        expected_kw_list = ['Test A Keyword', 'Keyword']
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        self.assertEqual(arg_list, [[], []])
        data = self.get_s2l()
        parsed_kw, arg_list = self.index.get_keywords(data)
        self.assertTrue('Set Window Position' in parsed_kw)
        self.assertTrue('Get Cookies' in parsed_kw)
        self.assertTrue('Unselect Frame' in parsed_kw)
        self.assertTrue(['name'] in arg_list)
        l = ['driver_name', 'alias', 'kwargs', '**init_kwargs']
        self.assertTrue(l in arg_list)
        self.assertTrue(['*code'] in arg_list)

    def test_get_imports(self):
        data = self.get_resource_b()
        import_list = [self.process_table_name]
        self.assertEqual(self.index.get_imports(data), import_list)
        data = self.get_test_a()
        import_list = [self.common_table_name, self.resource_a_table_name]
        # NOTE(review): .sort() returns None — see test_get_keywords_resource.
        self.assertEqual(
            self.index.get_imports(data).sort(), import_list.sort())
        data = self.get_s2l()
        self.assertEqual(self.index.get_imports(data), [])

    def test_get_variables(self):
        data = self.get_resource_b()
        var = ['${RESOURCE_B}']
        self.assertEqual(self.index.get_variables(data), var)
        data = self.get_test_a()
        var = ['${TEST_A}']
        self.assertEqual(self.index.get_variables(data).sort(), var.sort())
        data = self.get_s2l()
        self.assertEqual(self.index.get_variables(data), [])
        data = self.get_common()
        self.assertEqual(self.index.get_variables(data), [])

    def test_get_kw_for_index(self):
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        table_name = self.resource_b_table_name
        l, kw_list, arg_list, object_name, table_name = \
            self.get_resource_b_kw_index(KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)
        l, kw_list, arg_list, object_name, table_name = \
            self.get_test_a_kw_index(KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)
        l, kw_list, arg_list, object_name, table_name = self.get_s2l_kw_index(
            KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)

    def test_index_creation_test_a(self):
        # Expected index = keywords of test_a plus everything it imports.
        table_name = self.test_a_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        kw_list = []
        kw_list.extend(self.get_test_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_common_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_resource_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_s2l_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_os_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_builtin_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_LibNoClass_kw_index(KeywordRecord)[0])
        var_list = [
            u'${TEST_A}', u'${RESOURCE_A}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        self.assertEqual(r_index['variable'].sort(),
                         t_index['variable'].sort())
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        self.assertEqual(r_index['keyword'].sort(), t_index['keyword'].sort())

    def test_index_creation_test_b(self):
        # Expected index = keywords of test_b plus everything it imports.
        table_name = self.test_b_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        kw_list = []
        kw_list.extend(self.get_test_b_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_common_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_resource_b_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_s2l_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_process_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_builtin_kw_index(KeywordRecord)[0])
        var_list = [
            u'${TEST_B}', u'${RESOURCE_B}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        self.assertEqual(r_index['variable'].sort(),
                         t_index['variable'].sort())
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        self.assertEqual(r_index['keyword'].sort(), t_index['keyword'].sort())

    def test_index_consturctor(self):
        self.index.index_consturctor(self.resource_a_table_name)
        files = os.listdir(self.index_dir)
        self.assertEqual(len(files), 1)
        with open(os.path.join(self.index_dir, files[0])) as f:
            data = json.load(f)
        self.assertIn('variable', data)
        self.assertIn('keyword', data)
        self.assertFalse(
            any(kw[0] == 'Test A Keyword' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Resource A Keyword 1' for kw in data['keyword']))

    def test_get_kw_arguments(self):
        # '=defaults' are stripped; ${}/@{}/&{} decorations map to
        # plain / *starred / **double-starred argument names.
        kw_args = [u'item', u'msg=None']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'item', u'msg']
        self.assertEqual(result, expected)
        kw_args = [u'name', u'*args']
        result = self.index.get_kw_arguments(kw_args)
        self.assertEqual(result, kw_args)
        kw_args = []
        result = self.index.get_kw_arguments(kw_args)
        self.assertEqual(result, kw_args)
        kw_args = [u'object=None', u'*args', u'**kwargs']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'object', u'*args', u'**kwargs']
        self.assertEqual(result, expected)
        kw_args = [u'${kwa1}', '@{list}', '&{kwargs}']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'kwa1', '*list', '**kwargs']
        self.assertEqual(result, expected)

    def test_add_xml_libraries(self):
        self.assertEqual(len(self.index.queue.queue), 0)
        self.index.add_xml_libraries(self.xml_libs)
        self.assertEqual(len(self.index.queue.queue), 2)

    def test_index_with_xml_libraries(self):
        xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
        db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
        scanner = Scanner(xml_libs)
        scanner.scan(self.suite_dir, 'robot', db_dir_with_xml)
        index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
        index.index_consturctor(self.resource_a_table_name)
        files = os.listdir(self.index_dir)
        self.assertEqual(len(files), 1)
        with open(os.path.join(self.index_dir, files[0])) as f:
            data = json.load(f)
        self.assertTrue(any(kw[2] == 'SwingLibrary' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Add Table Cell Selection' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Select From Popup Menu' for kw in data['keyword']))

    def test_get_object_name(self):
        object_name = self.index.get_object_name(self.get_libnoclass())
        self.assertEqual(object_name, 'LibNoClass')
        object_name = self.index.get_object_name(self.get_resource_b())
        self.assertEqual(object_name, 'resource_b')
        object_name = self.index.get_object_name(self.get_os())
        self.assertEqual(object_name, 'OperatingSystem')
        object_name = self.index.get_object_name(self.get_s2l())
        self.assertEqual(object_name, 'Selenium2Library')

    # ------- table-name helpers ------------------------------------------

    @property
    def common_table_name_index(self):
        index = 'index-{0}'.format(self.common_table_name)
        return os.path.join(self.index_dir, index)

    @property
    def test_a_table_name_index(self):
        index = 'index-{0}'.format(self.test_a_table_name)
        return os.path.join(self.index_dir, index)

    @property
    def real_suite_table_name(self):
        return rf_table_name(
            os.path.normcase(
                os.path.join(self.real_suite_dir, 'test', 'real_suite.robot')))

    @property
    def resource_b_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'resource_b.robot')))

    @property
    def common_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'common.robot')))

    @property
    def test_a_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_a.robot')))

    @property
    def test_b_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_b.robot')))

    @property
    def resource_a_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'resource_a.robot')))

    @property
    def s2l_table_name(self):
        return lib_table_name('Selenium2Library')

    @property
    def os_table_name(self):
        return lib_table_name('OperatingSystem')

    @property
    def process_table_name(self):
        return lib_table_name('Process')

    @property
    def builtin_table_name(self):
        return lib_table_name('BuiltIn')

    @property
    def libnoclass_table_name(self):
        return lib_table_name('LibNoClass')

    # ------- fixture loaders ---------------------------------------------
    # NOTE(review): these loaders never close the file handle; a `with`
    # block would avoid ResourceWarnings.

    def get_resource_b(self):
        f = open(os.path.join(self.db_dir, self.resource_b_table_name))
        return json.load(f)

    def get_common(self):
        f = open(os.path.join(self.db_dir, self.common_table_name))
        return json.load(f)

    def get_test_a(self):
        f = open(os.path.join(self.db_dir, self.test_a_table_name))
        return json.load(f)

    def get_s2l(self):
        f = open(os.path.join(self.db_dir, self.s2l_table_name))
        return json.load(f)

    def get_os(self):
        f = open(os.path.join(self.db_dir, self.os_table_name))
        return json.load(f)

    def get_process(self):
        f = open(os.path.join(self.db_dir, self.process_table_name))
        return json.load(f)

    def getbuiltin(self):
        f = open(os.path.join(self.db_dir, self.builtin_table_name))
        return json.load(f)

    def get_libnoclass(self):
        f = open(os.path.join(self.db_dir, self.libnoclass_table_name))
        return json.load(f)

    # ------- expected keyword-index builders ------------------------------

    def get_s2l_kw_index(self, keywordrecord):
        s2l_data = self.get_s2l()
        kw_list = self.index.get_keywords(s2l_data)[0]
        arg_list = self.get_kw_args(s2l_data)
        object_name = 'Selenium2Library'
        table_name = self.s2l_table_name
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_os_kw_index(self, keywordrecord):
        os_data = self.get_os()
        kw_list = self.index.get_keywords(os_data)[0]
        arg_list = self.get_kw_args(os_data)
        object_name = 'OperatingSystem'
        table_name = self.os_table_name
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_process_kw_index(self, keywordrecord):
        data = self.get_process()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'Process'
        table_name = self.process_table_name
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_builtin_kw_index(self, keywordrecord):
        data = self.getbuiltin()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'BuiltIn'
        table_name = self.builtin_table_name
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_LibNoClass_kw_index(self, keywordrecord):
        data = self.get_libnoclass()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'LibNoClass'
        table_name = self.libnoclass_table_name
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_test_a_kw_index(self, keywordrecord):
        kw_list = [u'Test A Keyword', u'Keyword']
        arg_list = [None, None]
        table_name = self.test_a_table_name
        object_name = u'test_a.robot'
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_test_b_kw_index(self, keywordrecord):
        # NOTE(review): object_name says 'test_a.robot' for the test_b
        # table — looks like a copy-paste slip; confirm intended fixture.
        kw_list = []
        table_name = self.test_b_table_name
        object_name = u'test_a.robot'
        l = []
        return l, kw_list, [None], object_name, table_name

    def get_resource_a_kw_index(self, keywordrecord):
        kw_list = [u'Resource A Keyword 1', u'resource A Keyword 2']
        arg_list = ['kwa1', None]
        table_name = self.resource_a_table_name
        object_name = u'resource_a.robot'
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_resource_b_kw_index(self, keywordrecord):
        kw_list = [u'Resource B Keyword 1', u'resource B Keyword 2']
        arg_list = ['kwb1', None]
        table_name = self.resource_b_table_name
        object_name = u'resource_b.robot'
        l = []
        for kw, arg in zip(kw_list, arg_list):
            l.append(
                keywordrecord(keyword=kw, argument=arg,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, arg_list, object_name, table_name

    def get_common_kw_index(self, keywordrecord):
        kw_list = [
            u'Common Keyword 2', u'common Keyword 1',
            u'Really Long Keyword To Test With Jumping To Keyword Does Not Scroll The Visible Area To A Wrong Place Should There Be More Words'
        ]
        table_name = self.common_table_name
        object_name = u'common.robot'
        l = []
        for kw in kw_list:
            l.append(
                keywordrecord(keyword=kw, argument=None,
                              object_name=object_name, table_name=table_name))
        return l, kw_list, [None], object_name, table_name

    def get_kw_args(self, data):
        """Collect all keyword argument names, stripping '=default' parts."""
        arg_list = []
        kws = data["keywords"]
        # NOTE(review): dict.iterkeys() is Python 2 only — under Python 3
        # this raises AttributeError; use keys()/items() instead.
        for i in kws.iterkeys():
            args = kws[i]['keyword_arguments']
            for arg in args:
                if '=' in arg:
                    arg_list.append(arg.split('=')[0])
                else:
                    arg_list.append(arg)
        return arg_list
def test_obtain_text_of_a_document(self):
    """Document 1 of AP88-89 must round-trip to this exact text.

    NOTE(review): golden data copied verbatim from the index — whitespace
    inside the literal is significant; do not reflow it.
    """
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    res = self.index_.obtain_text_of_a_document(1)
    self.assertEqual(
        res,
        """ Public Order Minister Tassos Sehiotis resigned Monday after a Greek-American banker indicted in a $30 million financial scandal fled the country, apparently aboard a yacht. The conservative opposition immediately demanded the resignation of Premier Andreas Papandreou's socialist government, claiming it was staging a cover-up. Banker George Koskotas, 34, disappeared Saturday afternoon. A police officer, speaking on condition of anonymity, said Koskotas fled abroad on Sunday, apparently by yacht from the seaside village of Megalo Pefko, 20 miles from Athens. Sehiotis said a warrant had been issued for Koskotas' arrest. One week ago, Koskotas was banned from leaving Greece pending the outcome of an official enquiry into alleged financial irregularities at the Bank of Crete, which he controls. Sehiotis, whose ministry was responsible for police surveillance of Koskotas, said he was resigning ``since such (public order ministry) omissions ... create an issue of political sensitivity.'' The scandal has shaken the government because of accusations in Greek newspapers that senior socialist officials were involved in illegal deals set up by the Bank of Crete. The socialists also have been criticized for permitting Koskotas to build a multi-million dollar banking and media empire in Greece since 1984 without adequate checks by the central bank on his financial background. The government last week pledged ``absolute clarity'' in uncovering the scandal and warned there will be ``no pardons' for members of the ruling Panhellenic Socialist Movement (PASOK) who may be implicated. ``The Greek people are left with the conviction that George Koskotas was spirited away so that he would not speak. The responsibility goes all the way to the top of the government pyramid,'' Constantine Mitsotakis, leader of the New Democracy main opposition party party, said in a statement demanding the government resign. Koskotas was suspended Oct. 20 as chairman of the Bank of Crete and indicted on five counts of forgery and embezzlement. Last week Koskotas appeared before a district attorney on a charge of forging documents purporting to show that the Bank of Crete had $13 million invested with the American brokerage firm Merrill Lynch. He was not detained but given until Nov. 14 to prepare his defense. Koskotas also is accused of forging documents purporting to show his bank had another $17 million in an account with an American bank, Irving Trust Corp. Both U.S. firms have said they had no record of the deposits. Koskotas, who holds both American and Greek citizenship, bought a controlling interest in the Bank of Crete in 1984 after working in its central Athens branch for six years as an accountant. Rival newspapers have claimed Koskotas illegally used Bank of Crete money to fund his publishing group Grammi, which controls three daily newspapers, five magazines and a radio station. Koskotas resigned Oct. 29 as chairman of Grammi, the day after the premier's son, Education Minister George Papandreou, denounced as a forgery a Bank of Crete statement showing a $2.3 million transfer to a Merrill Lynch account in his name. The younger Papandreou showed reporters a letter from a New York lawyer saying there was no record at Merrill Lynch of such a transfer. Koskotas' parents, brother, wife and five children all have left Greece during the past week. """)
class TestIndex(TestCase):
    """Tests for the Indri-backed Index wrapper.

    Two indexes are used: a small checked-in test index
    ('../index/test_files/index') created in setUp, and — for the
    statistics/text tests — a large AP88-89 index at the absolute path
    '/scratch/index/indri_5_7/ap8889'.

    NOTE(review): the '/scratch/...' tests only run on a machine where that
    index has been built; expected counts/floats are pinned to that exact
    index build.
    """

    def setUp(self):
        # Fresh parameters + index per test; several tests re-point
        # repo_dir at the big AP index and rebuild self.index_ themselves.
        self.parameters = Parameters()
        self.parameters.params["repo_dir"] = '../index/test_files/index'
        self.index_ = Index(self.parameters)

    def test_uw_expression_count(self):
        # Unordered-window (#uw) occurrence count within a window of 12.
        self.assertEqual(self.index_.uw_expression_count("SAMPSON Dog", 12), 2)

    def test_od_expression_count(self):
        # Ordered-window (#od) occurrence count within a window of 12.
        self.assertEqual(self.index_.od_expression_count("SAMPSON True", 12), 1)

    def test_uw_document_expression_count(self):
        self.assertEqual(
            self.index_.uw_expression_document_count("SAMPSON True", 12), 1)

    def test_od_document_expression_count(self):
        self.assertEqual(
            self.index_.od_expression_document_count("SAMPSON True", 12), 1)

    def test_term_count(self):
        self.assertEqual(self.index_.term_count("dog"), 2)
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        self.assertEqual(self.index_.term_count("emotional"), 3515)

    def test_document_count(self):
        self.assertEqual(self.index_.document_count("dog"), 1)
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        self.assertEqual(self.index_.document_count("emotional"), 2973)

    def test_check_if_have_same_stem(self):
        self.assertEqual(self.index_.check_if_have_same_stem("goes", "goe"), True)
        self.assertEqual(self.index_.check_if_have_same_stem("goes", "g"), False)
        self.assertEqual(self.index_.check_if_have_same_stem("first", "mr"), False)

    def test_idf(self):
        # NOTE(review): exact float equality is brittle; also the second
        # assertion checks document_count, not idf — presumably verifying
        # "first" was dropped as a stopword at indexing time. Confirm intent.
        self.assertEqual(self.index_.idf("dog"), 1.0986122886681098)
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        self.assertEqual(self.index_.document_count("first"), 0)

    def test_tfidf(self):
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        doc_words = SimpleDocument(self.parameters).get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")
        tfidf_1 = self.index_.tfidf('emotional', doc_words)
        print(tfidf_1, file=sys.stderr)
        self.assertEqual(tfidf_1, 2.0455597255490345)
        # A term absent from the index (stopword at indexing time) raises.
        with self.assertRaises(Exception) as context:
            self.index_.tfidf('is', doc_words)
        self.assertTrue(
            'unigram "is" not exist. Probably was a stopword in indexing.' in
            str(context.exception))

    def test_tf(self):
        doc_words = SimpleDocument(self.parameters).get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")
        self.assertEqual(self.index_.tf("dog", doc_words), 0.5)
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        self.assertEqual(self.index_.tf("emotional", doc_words), 0.5)

    def test_check_if_exists_in_index(self):
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        self.assertTrue(self.index_.check_if_exists_in_index("emotional"))
        self.assertFalse(self.index_.check_if_exists_in_index("first"))
        # 'included' is absent but 'includes' exists — the index stores
        # stemmed/normalized forms, not all inflections.
        self.assertFalse(self.index_.check_if_exists_in_index("included"))
        self.assertTrue(self.index_.check_if_exists_in_index("includes"))

    def test_obtain_text_of_a_document(self):
        # Full text of internal document 1 (AP881107-0001), verbatim.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_text_of_a_document(1)
        self.assertEqual(
            res,
            """ Public Order Minister Tassos Sehiotis resigned Monday after a Greek-American banker indicted in a $30 million financial scandal fled the country, apparently aboard a yacht. The conservative opposition immediately demanded the resignation of Premier Andreas Papandreou's socialist government, claiming it was staging a cover-up. Banker George Koskotas, 34, disappeared Saturday afternoon. A police officer, speaking on condition of anonymity, said Koskotas fled abroad on Sunday, apparently by yacht from the seaside village of Megalo Pefko, 20 miles from Athens. Sehiotis said a warrant had been issued for Koskotas' arrest. 
One week ago, Koskotas was banned from leaving Greece pending the outcome of an official enquiry into alleged financial irregularities at the Bank of Crete, which he controls. Sehiotis, whose ministry was responsible for police surveillance of Koskotas, said he was resigning ``since such (public order ministry) omissions ... create an issue of political sensitivity.'' The scandal has shaken the government because of accusations in Greek newspapers that senior socialist officials were involved in illegal deals set up by the Bank of Crete. The socialists also have been criticized for permitting Koskotas to build a multi-million dollar banking and media empire in Greece since 1984 without adequate checks by the central bank on his financial background. The government last week pledged ``absolute clarity'' in uncovering the scandal and warned there will be ``no pardons' for members of the ruling Panhellenic Socialist Movement (PASOK) who may be implicated. ``The Greek people are left with the conviction that George Koskotas was spirited away so that he would not speak. The responsibility goes all the way to the top of the government pyramid,'' Constantine Mitsotakis, leader of the New Democracy main opposition party party, said in a statement demanding the government resign. Koskotas was suspended Oct. 20 as chairman of the Bank of Crete and indicted on five counts of forgery and embezzlement. Last week Koskotas appeared before a district attorney on a charge of forging documents purporting to show that the Bank of Crete had $13 million invested with the American brokerage firm Merrill Lynch. He was not detained but given until Nov. 14 to prepare his defense. Koskotas also is accused of forging documents purporting to show his bank had another $17 million in an account with an American bank, Irving Trust Corp. Both U.S. firms have said they had no record of the deposits. 
Koskotas, who holds both American and Greek citizenship, bought a controlling interest in the Bank of Crete in 1984 after working in its central Athens branch for six years as an accountant. Rival newspapers have claimed Koskotas illegally used Bank of Crete money to fund his publishing group Grammi, which controls three daily newspapers, five magazines and a radio station. Koskotas resigned Oct. 29 as chairman of Grammi, the day after the premier's son, Education Minister George Papandreou, denounced as a forgery a Bank of Crete statement showing a $2.3 million transfer to a Merrill Lynch account in his name. The younger Papandreou showed reporters a letter from a New York lawyer saying there was no record at Merrill Lynch of such a transfer. Koskotas' parents, brother, wife and five children all have left Greece during the past week. """)

    def test_obtain_term_ids_of_a_document(self):
        # Same document as above as (external_doc_name, term-id sequence);
        # id 0 marks tokens removed at indexing time (stopwords etc.).
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_term_ids_of_a_document(1)
        self.assertEqual(
            res,
            ('AP881107-0001',
             (147, 771, 0, 78064, 26, 2828, 1283, 92, 126, 147, 175009,
              159395, 771, 55, 0, 0, 2362, 26, 2828, 919, 0, 0, 115, 8, 461,
              1624, 1826, 0, 35, 693, 1198, 0, 195412, 0, 724, 430, 621, 340,
              0, 771, 0, 1502, 20649, 4327, 1620, 9, 247, 0, 0, 866, 0, 643,
              0, 2828, 415, 101374, 1289, 2015, 276, 1246, 0, 24, 29, 586, 0,
              272, 0, 856, 0, 101374, 1826, 2153, 0, 174, 693, 0, 195412, 0,
              0, 158999, 1037, 0, 117013, 137162, 123, 157, 0, 4415, 159395,
              0, 0, 2262, 0, 0, 56, 0, 101374, 251, 0, 0, 189, 101374, 0,
              569, 0, 332, 3095, 1873, 0, 2974, 0, 0, 13, 63630, 0, 485, 461,
              91464, 0, 0, 91, 0, 50405, 0, 0, 156, 159395, 0, 690, 0, 347,
              0, 24, 4049, 0, 101374, 0, 0, 0, 771, 0, 0, 92, 126, 690,
              131907, 609, 0, 56, 0, 88, 2222, 0, 0, 1624, 0, 160974, 0, 9,
              0, 0, 436, 0, 2362, 273, 0, 774, 1620, 13, 0, 263, 0, 887, 339,
              176, 0, 0, 0, 91, 0, 50405, 0, 1620, 0, 0, 0, 289, 0, 1202,
              101374, 0, 254, 0, 4543, 8, 193, 91, 0, 979, 3597, 0, 3095, 0,
              791, 0, 2768, 937, 0, 0, 264, 91, 0, 0, 461, 2464, 0, 9, 0, 0,
              1333, 2198, 45622, 0, 4433, 0, 1624, 0, 678, 0, 0, 0, 0, 3790,
              0, 40, 0, 0, 118, 135313, 1620, 727, 136295, 0, 0, 0, 3408, 0,
              2362, 6, 0, 245, 0, 0, 494, 0, 415, 101374, 0, 2042, 435, 0, 0,
              0, 0, 0, 586, 0, 347, 1461, 0, 0, 116, 0, 0, 354, 0, 0, 9,
              145130, 0, 48310, 120515, 51, 0, 0, 1, 956, 540, 430, 32, 32,
              0, 0, 0, 221, 340, 0, 9, 771, 101374, 0, 1431, 897, 123, 0,
              248, 0, 0, 91, 0, 50405, 0, 919, 0, 114, 667, 0, 70387, 0,
              62918, 0, 0, 101374, 261, 0, 0, 311, 212, 0, 0, 61, 0, 4322,
              767, 144897, 0, 62, 0, 0, 91, 0, 50405, 0, 375, 8, 546, 0, 0,
              26, 3527, 543, 3657, 3224, 0, 0, 0, 2223, 0, 533, 0, 1019, 367,
              0, 674, 0, 165, 101374, 0, 0, 436, 0, 4322, 767, 144897, 0, 62,
              0, 91, 0, 0, 445, 8, 0, 0, 550, 0, 0, 26, 91, 3533, 1512, 285,
              0, 0, 543, 0, 0, 0, 0, 0, 192, 0, 0, 1798, 101374, 0, 328, 0,
              26, 0, 2362, 45369, 1390, 0, 156, 167, 0, 0, 91, 0, 50405, 0,
              791, 0, 28, 0, 0, 264, 4415, 1977, 0, 187, 15, 0, 0, 550, 1483,
              273, 0, 247, 101374, 887, 0, 91, 0, 50405, 168, 0, 303, 0, 515,
              34, 77704, 0, 156, 23, 701, 273, 114, 1056, 0, 0, 409, 490,
              101374, 771, 897, 899, 0, 248, 0, 77704, 0, 0, 0, 0, 1502, 599,
              463, 147, 415, 4327, 2512, 0, 0, 70387, 0, 91, 0, 50405, 221,
              62, 0, 17, 41, 8, 1548, 0, 0, 3657, 3224, 550, 0, 0, 171, 0,
              2597, 4327, 62, 4, 0, 512, 0, 0, 1, 73, 532, 7, 0, 0, 0, 192,
              0, 3657, 3224, 0, 0, 0, 1548, 101374, 626, 918, 455, 0, 114,
              219, 0, 0, 245, 3095, 0, 0, 333, 0)))

    def test_obtain_terms_of_a_document(self):
        # Same document as stemmed term strings; '' marks removed tokens.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_terms_of_a_document(1)
        print(res, file=sys.stderr)
        self.assertEqual(res, ('AP881107-0001', (
            'minist', 'resign', '', 'gree', 'american', 'banker', 'escap',
            'public', 'order', 'minist', 'tasso', 'sehioti', 'resign',
            'mondai', '', '', 'greek', 'american', 'banker', 'indict', '', '',
            '30', 'million', 'financi', 'scandal', 'fled', '', 'countri',
            'appar', 'aboard', '', 'yacht', '', 'conserv', 'opposit',
            'immedi', 'demand', '', 'resign', '', 'premier', 'andrea',
            'papandr', 'socialist', 'govern', 'claim', '', '', 'stage', '',
            'cover', '', 'banker', 'georg', 'koskota', '34', 'disappear',
            'saturdai', 'afternoon', '', 'polic', 'offic', 'speak', '',
            'condit', '', 'anonym', '', 'koskota', 'fled', 'abroad', '',
            'sundai', 'appar', '', 'yacht', '', '', 'seasid', 'villag', '',
            'megalo', 'pefko', '20', 'mile', '', 'athen', 'sehioti', '', '',
            'warrant', '', '', 'issu', '', 'koskota', 'arrest', '', '',
            'ago', 'koskota', '', 'ban', '', 'leav', 'greec', 'pend', '',
            'outcom', '', '', 'offici', 'enquiri', '', 'alleg', 'financi',
            'irregular', '', '', 'bank', '', 'crete', '', '', 'control',
            'sehioti', '', 'ministri', '', 'respons', '', 'polic', 'surveil',
            '', 'koskota', '', '', '', 'resign', '', '', 'public', 'order',
            'ministri', 'omiss', 'creat', '', 'issu', '', 'polit', 'sensit',
            '', '', 'scandal', '', 'shaken', '', 'govern', '', '', 'accus',
            '', 'greek', 'newspap', '', 'senior', 'socialist', 'offici', '',
            'involv', '', 'illeg', 'deal', 'set', '', '', '', 'bank', '',
            'crete', '', 'socialist', '', '', '', 'critic', '', 'permit',
            'koskota', '', 'build', '', 'multi', 'million', 'dollar', 'bank',
            '', 'media', 'empir', '', 'greec', '', '1984', '', 'adequ',
            'check', '', '', 'central', 'bank', '', '', 'financi',
            'background', '', 'govern', '', '', 'pledg', 'absolut',
            'clariti', '', 'uncov', '', 'scandal', '', 'warn', '', '', '',
            '', 'pardon', '', 'member', '', '', 'rule', 'panhellen',
            'socialist', 'movement', 'pasok', '', '', '', 'implic', '',
            'greek', 'peopl', '', 'left', '', '', 'convict', '', 'georg',
            'koskota', '', 'spirit', 'awai', '', '', '', '', '', 'speak', '',
            'respons', 'goe', '', '', 'wai', '', '', 'top', '', '', 'govern',
            'pyramid', '', 'constantin', 'mitsotaki', 'leader', '', '',
            'new', 'democraci', 'main', 'opposit', 'parti', 'parti', '', '',
            '', 'statement', 'demand', '', 'govern', 'resign', 'koskota', '',
            'suspend', 'oct', '20', '', 'chairman', '', '', 'bank', '',
            'crete', '', 'indict', '', 'five', 'count', '', 'forgeri', '',
            'embezzl', '', '', 'koskota', 'appear', '', '', 'district',
            'attornei', '', '', 'charg', '', 'forg', 'document', 'purport',
            '', 'show', '', '', 'bank', '', 'crete', '', '13', 'million',
            'invest', '', '', 'american', 'brokerag', 'firm', 'merril',
            'lynch', '', '', '', 'detain', '', 'given', '', 'nov', '14', '',
            'prepar', '', 'defens', 'koskota', '', '', 'accus', '', 'forg',
            'document', 'purport', '', 'show', '', 'bank', '', '', '17',
            'million', '', '', 'account', '', '', 'american', 'bank', 'irv',
            'trust', 'corp', '', '', 'firm', '', '', '', '', '', 'record',
            '', '', 'deposit', 'koskota', '', 'hold', '', 'american', '',
            'greek', 'citizenship', 'bought', '', 'control', 'interest', '',
            '', 'bank', '', 'crete', '', '1984', '', 'work', '', '',
            'central', 'athen', 'branch', '', 'six', 'year', '', '',
            'account', 'rival', 'newspap', '', 'claim', 'koskota', 'illeg',
            '', 'bank', '', 'crete', 'monei', '', 'fund', '', 'publish',
            'group', 'grammi', '', 'control', 'three', 'daili', 'newspap',
            'five', 'magazin', '', '', 'radio', 'station', 'koskota',
            'resign', 'oct', '29', '', 'chairman', '', 'grammi', '', '', '',
            '', 'premier', 'son', 'educ', 'minist', 'georg', 'papandr',
            'denounc', '', '', 'forgeri', '', 'bank', '', 'crete',
            'statement', 'show', '', '2', '3', 'million', 'transfer', '', '',
            'merril', 'lynch', 'account', '', '', 'name', '', 'younger',
            'papandr', 'show', 'report', '', 'letter', '', '', 'new', 'york',
            'lawyer', 'sai', '', '', '', 'record', '', 'merril', 'lynch', '',
            '', '', 'transfer', 'koskota', 'parent', 'brother', 'wife', '',
            'five', 'children', '', '', 'left', 'greec', '', '', 'past',
            '')))

    def test_term(self):
        # term() maps an internal term id back to its stemmed string.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.term(147)
        print(res, file=sys.stderr)
        self.assertEqual(res, 'minist')

    def test_expression_list(self):
        # Per-document counts of an expression over the whole collection.
        self.assertEqual(self.index_.expression_list("SAMPSON Dog", "#uw", 12),
                         {'romeo': 2})
        self.assertEqual(self.index_.expression_list("your", "#uw", 12), {
            'hamlet': 1,
            'romeo': 3
        })

    def test_run_query(self):
        self.index_.init_query_env()
        # Results are (internal doc id, log-probability score) pairs.
        self.assertEqual(self.index_.run_query("you"),
                         ((3, -4.207161834249701), (2, -4.27477458192466)))

    def test_run_query_doc_names(self):
        self.index_.init_query_env()
        self.assertEqual(self.index_.run_query_doc_names("you"),
                         ['romeo', 'hamlet'])

    def test_get_ext_document_id(self):
        self.assertEqual(self.index_.get_ext_document_id(1), 'lorem')

    def test_expression_list_in_top_docs(self):
        self.index_.init_query_env()
        runs = self.index_.run_query_doc_names("a")
        # Counts restricted to the top-k documents of a previous run.
        self.assertEqual(
            self.index_.expression_list_in_top_docs("you", "#uw", 12, 2, runs), {
                'hamlet': 1,
                'romeo': 9
            })
        self.assertEqual(
            self.index_.expression_list_in_top_docs("you", "#uw", 12, 1, runs),
            {'romeo': 9})

    def test_document_length_doc_name(self):
        self.assertEqual(self.index_.document_length_doc_name('lorem'), 88)
        self.assertEqual(self.index_.document_length_doc_name('hamlet'), 71)

    def test_document_length_docs_names(self):
        # Sum of individual document lengths: 88 + 71.
        self.assertEqual(
            self.index_.document_length_docs_names(['lorem', 'hamlet']), 159)

    def test_expand_query(self):
        self.assertEqual(
            self.index_.expand_query('consectetur adipiscing', 10, 10,
                                     ['lorem', 'hamlet']),
            [
                'francisco', 'bernardo', 'i', 'at', 'nulla', 'consectetur',
                'in', 'eget', 'and', 'the'
            ])
class TestIndexing(unittest.TestCase):
    """Tests for the Robot Framework keyword/variable Index builder.

    The content of db_dir was created with Scanner by scanning the
    TEST_DATA_DIR/suite_tree folder. If Scanner is changed, db_dir must be
    recreated.
    """

    @classmethod
    def setUpClass(cls):
        cls.db_dir = os.path.join(env.RESULTS_DIR, 'db_dir')
        cls.suite_dir = os.path.join(env.TEST_DATA_DIR, 'suite_tree')
        scanner = Scanner()
        scanner.scan(cls.suite_dir, 'robot', cls.db_dir)
        cls.xml_libs = os.path.join(env.RESOURCES_DIR, 'library')

    def setUp(self):
        self.index_dir = os.path.join(env.RESULTS_DIR, 'index_dir',)
        if os.path.exists(self.index_dir):
            # Retry removal until the directory is really gone — presumably
            # rmtree can race with another process holding a handle; confirm.
            while os.path.exists(self.index_dir):
                shutil.rmtree(self.index_dir)
                sleep(0.1)
        os.makedirs(self.index_dir)
        self.index = Index(self.db_dir, self.index_dir)

    def test_parse_table_data(self):
        t_name = os.path.join(
            env.RESOURCES_DIR,
            'BuiltIn-ca8f2e8d70641ce17b9b304086c19657.json')
        self.index.queue.add(t_name, None, None)
        data, status = self.index.read_table(
            os.path.join(env.RESOURCES_DIR, t_name))
        var, kw_index = self.index.parse_table_data(data, t_name)
        self.assertTrue(u'${/}' in var)
        self.assertTrue('${OUTPUT_FILE}' in var)
        self.assertTrue('@{TEST_TAGS}' in var)

    def test_add_builtin(self):
        self.index.add_builtin_to_queue(self.db_dir)
        self.assertTrue(len(self.index.queue.queue) > 0)

    def test_read_table(self):
        data, read_status = self.index.read_table(
            os.path.join(self.db_dir, self.test_b_table_name))
        # BUG FIX: was assertTrue(data['file_name'], 'test_b.robot'), which
        # used the expected value as the failure message and never compared.
        self.assertEqual(data['file_name'], 'test_b.robot')

    def test_get_keywords_resource(self):
        data = self.get_resource_b()
        expected_kw_list = ['Resource B Keyword 2', 'Resource B Keyword 1']
        expected_arg_list = [['kwb1'], []]
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        # BUG FIX: was assertEqual(arg_list.sort(), expected_arg_list.sort())
        # — list.sort() returns None, so that assertion was always vacuous.
        self.assertCountEqual(arg_list, expected_arg_list)
        data = self.get_test_a()
        expected_kw_list = ['Test A Keyword', 'Keyword']
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        self.assertEqual(arg_list, [[], []])
        data = self.get_s2l()
        parsed_kw, arg_list = self.index.get_keywords(data)
        self.assertTrue('Set Window Position' in parsed_kw)
        self.assertTrue('Get Cookies' in parsed_kw)
        self.assertTrue('Unselect Frame' in parsed_kw)
        self.assertTrue(['name'] in arg_list)
        l = ['driver_name', 'alias', 'kwargs', '**init_kwargs']
        self.assertTrue(l in arg_list)
        self.assertTrue(['*code'] in arg_list)

    def test_get_imports(self):
        data = self.get_resource_b()
        import_list = [self.process_table_name]
        self.assertEqual(self.index.get_imports(data), import_list)
        data = self.get_test_a()
        import_list = [self.common_table_name, self.resource_a_table_name]
        # BUG FIX: .sort()/.sort() comparison was vacuous (None == None).
        self.assertCountEqual(self.index.get_imports(data), import_list)
        data = self.get_s2l()
        self.assertEqual(self.index.get_imports(data), [])

    def test_get_variables(self):
        data = self.get_resource_b()
        var = ['${RESOURCE_B}']
        self.assertEqual(self.index.get_variables(data), var)
        data = self.get_test_a()
        var = ['${TEST_A}']
        # BUG FIX: .sort()/.sort() comparison was vacuous (None == None).
        self.assertCountEqual(self.index.get_variables(data), var)
        data = self.get_s2l()
        self.assertEqual(self.index.get_variables(data), [])
        data = self.get_common()
        self.assertEqual(self.index.get_variables(data), [])

    def test_get_kw_for_index(self):
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        table_name = self.resource_b_table_name
        l, kw_list, arg_list, object_name, table_name = \
            self.get_resource_b_kw_index(KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)
        l, kw_list, arg_list, object_name, table_name = \
            self.get_test_a_kw_index(KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)
        l, kw_list, arg_list, object_name, table_name = self.get_s2l_kw_index(
            KeywordRecord)
        self.assertEqual(
            self.index.get_kw_for_index(
                kw_list, arg_list, table_name, object_name), l)

    def test_index_creation_test_a(self):
        table_name = self.test_a_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        kw_list = []
        kw_list.extend(self.get_test_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_common_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_resource_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_s2l_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_os_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_builtin_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_LibNoClass_kw_index(KeywordRecord)[0])
        var_list = [
            u'${TEST_A}', u'${RESOURCE_A}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}']
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # BUG FIX: .sort()/.sort() comparisons were vacuous (None == None);
        # compare contents order-insensitively instead.
        self.assertCountEqual(r_index['variable'], t_index['variable'])
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        self.assertCountEqual(r_index['keyword'], t_index['keyword'])

    def test_index_creation_test_b(self):
        table_name = self.test_b_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord', 'keyword argument object_name table_name')
        kw_list = []
        kw_list.extend(self.get_test_b_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_common_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_resource_b_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_s2l_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_process_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_builtin_kw_index(KeywordRecord)[0])
        var_list = [
            u'${TEST_B}', u'${RESOURCE_B}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}']
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # BUG FIX: vacuous .sort()/.sort() comparisons, as above.
        self.assertCountEqual(r_index['variable'], t_index['variable'])
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        self.assertCountEqual(r_index['keyword'], t_index['keyword'])

    def test_index_consturctor(self):
        # NOTE(review): 'consturctor' is the Index API's own spelling —
        # cannot be fixed here without changing the production class.
        self.index.index_consturctor(self.resource_a_table_name)
        files = os.listdir(self.index_dir)
        self.assertEqual(len(files), 1)
        with open(os.path.join(self.index_dir, files[0])) as f:
            data = json.load(f)
        self.assertIn('variable', data)
        self.assertIn('keyword', data)
        self.assertFalse(
            any(kw[0] == 'Test A Keyword' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Resource A Keyword 1' for kw in data['keyword']))

    def test_get_kw_arguments(self):
        kw_args = [u'item', u'msg=None']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'item', u'msg']
        self.assertEqual(result, expected)
        kw_args = [u'name', u'*args']
        result = self.index.get_kw_arguments(kw_args)
        self.assertEqual(result, kw_args)
        kw_args = []
        result = self.index.get_kw_arguments(kw_args)
        self.assertEqual(result, kw_args)
        kw_args = [u'object=None', u'*args', u'**kwargs']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'object', u'*args', u'**kwargs']
        self.assertEqual(result, expected)
        kw_args = [u'${kwa1}', '@{list}', '&{kwargs}']
        result = self.index.get_kw_arguments(kw_args)
        expected = [u'kwa1', '*list', '**kwargs']
        self.assertEqual(result, expected)

    def test_add_xml_libraries(self):
        self.assertEqual(len(self.index.queue.queue), 0)
        self.index.add_xml_libraries(self.xml_libs)
        self.assertEqual(len(self.index.queue.queue), 2)

    def test_index_with_xml_libraries(self):
        xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
        db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
        scanner = Scanner(xml_libs)
        scanner.scan(self.suite_dir, 'robot', db_dir_with_xml)
        index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
        index.index_consturctor(self.resource_a_table_name)
        files = os.listdir(self.index_dir)
        self.assertEqual(len(files), 1)
        with open(os.path.join(self.index_dir, files[0])) as f:
            data = json.load(f)
        self.assertTrue(
            any(kw[2] == 'SwingLibrary' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Add Table Cell Selection' for kw in data['keyword']))
        self.assertTrue(
            any(kw[0] == 'Select From Popup Menu' for kw in data['keyword']))

    # ------------------------------------------------------------------
    # Table-name properties: paths/names derived from the scanned suite.
    # ------------------------------------------------------------------

    @property
    def common_table_name_index(self):
        index = 'index-{0}'.format(self.common_table_name)
        return os.path.join(self.index_dir, index)

    @property
    def test_a_table_name_index(self):
        index = 'index-{0}'.format(self.test_a_table_name)
        return os.path.join(self.index_dir, index)

    @property
    def real_suite_table_name(self):
        return rf_table_name(
            os.path.normcase(
                os.path.join(self.real_suite_dir, 'test', 'real_suite.robot')))

    @property
    def resource_b_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'resource_b.robot')))

    @property
    def common_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'common.robot')))

    @property
    def test_a_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_a.robot')))

    @property
    def test_b_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_b.robot')))

    @property
    def resource_a_table_name(self):
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'resource_a.robot')))

    @property
    def s2l_table_name(self):
        return lib_table_name('Selenium2Library')

    @property
    def os_table_name(self):
        return lib_table_name('OperatingSystem')

    @property
    def process_table_name(self):
        return lib_table_name('Process')

    @property
    def builtin_table_name(self):
        return lib_table_name('BuiltIn')

    @property
    def libnoclass_table_name(self):
        return lib_table_name('LibNoClass')

    # ------------------------------------------------------------------
    # JSON table loaders.
    # BUG FIX: these previously opened the file without ever closing it;
    # use context managers so the handles are released.
    # ------------------------------------------------------------------

    def get_resource_b(self):
        with open(os.path.join(
                self.db_dir, self.resource_b_table_name)) as f:
            return json.load(f)

    def get_common(self):
        with open(os.path.join(self.db_dir, self.common_table_name)) as f:
            return json.load(f)

    def get_test_a(self):
        with open(os.path.join(self.db_dir, self.test_a_table_name)) as f:
            return json.load(f)

    def get_s2l(self):
        with open(os.path.join(self.db_dir, self.s2l_table_name)) as f:
            return json.load(f)

    def get_os(self):
        with open(os.path.join(self.db_dir, self.os_table_name)) as f:
            return json.load(f)

    def get_process(self):
        with open(os.path.join(self.db_dir, self.process_table_name)) as f:
            return json.load(f)

    def getbuiltin(self):
        with open(os.path.join(self.db_dir, self.builtin_table_name)) as f:
            return json.load(f)

    def get_libnoclass(self):
        with open(os.path.join(self.db_dir, self.libnoclass_table_name)) as f:
            return json.load(f)

    # ------------------------------------------------------------------
    # Expected keyword-record builders.
    # ------------------------------------------------------------------

    def _kw_records(self, keywordrecord, kw_list, arg_list, object_name,
                    table_name):
        """Build one keywordrecord per (keyword, argument) pair."""
        return [
            keywordrecord(
                keyword=kw, argument=arg, object_name=object_name,
                table_name=table_name)
            for kw, arg in zip(kw_list, arg_list)]

    def get_s2l_kw_index(self, keywordrecord):
        s2l_data = self.get_s2l()
        kw_list = self.index.get_keywords(s2l_data)[0]
        arg_list = self.get_kw_args(s2l_data)
        object_name = 'Selenium2Library'
        table_name = self.s2l_table_name
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_os_kw_index(self, keywordrecord):
        os_data = self.get_os()
        kw_list = self.index.get_keywords(os_data)[0]
        arg_list = self.get_kw_args(os_data)
        object_name = 'OperatingSystem'
        table_name = self.os_table_name
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_process_kw_index(self, keywordrecord):
        data = self.get_process()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'Process'
        table_name = self.process_table_name
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_builtin_kw_index(self, keywordrecord):
        data = self.getbuiltin()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'BuiltIn'
        table_name = self.builtin_table_name
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_LibNoClass_kw_index(self, keywordrecord):
        data = self.get_libnoclass()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        # NOTE(review): object/table intentionally mirror BuiltIn here —
        # presumably LibNoClass keywords are indexed under BuiltIn; confirm.
        object_name = 'BuiltIn'
        table_name = self.builtin_table_name
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_test_a_kw_index(self, keywordrecord):
        kw_list = [u'Test A Keyword', u'Keyword']
        arg_list = [None, None]
        table_name = self.test_a_table_name
        object_name = u'test_a.robot'
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_test_b_kw_index(self, keywordrecord):
        # test_b defines no keywords of its own.
        kw_list = []
        table_name = self.test_b_table_name
        object_name = u'test_a.robot'
        return [], kw_list, [None], object_name, table_name

    def get_resource_a_kw_index(self, keywordrecord):
        kw_list = [u'Resource A Keyword 1', u'resource A Keyword 2']
        arg_list = ['kwa1', None]
        table_name = self.resource_a_table_name
        object_name = u'resource_a.robot'
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_resource_b_kw_index(self, keywordrecord):
        kw_list = [u'Resource B Keyword 1', u'resource B Keyword 2']
        arg_list = ['kwb1', None]
        table_name = self.resource_b_table_name
        object_name = u'resource_b.robot'
        records = self._kw_records(
            keywordrecord, kw_list, arg_list, object_name, table_name)
        return records, kw_list, arg_list, object_name, table_name

    def get_common_kw_index(self, keywordrecord):
        kw_list = [
            u'Common Keyword 2', u'common Keyword 1',
            u'Really Long Keyword To Test With Jumping To Keyword Does Not Scroll The Visible Area To A Wrong Place Should There Be More Words'
        ]
        table_name = self.common_table_name
        object_name = u'common.robot'
        # All common keywords take no arguments.
        records = self._kw_records(
            keywordrecord, kw_list, [None] * len(kw_list), object_name,
            table_name)
        return records, kw_list, [None], object_name, table_name

    def get_kw_args(self, data):
        """Collect declared argument names (default values stripped)."""
        arg_list = []
        kws = data["keywords"]
        # BUG FIX: was kws.iterkeys(), which is Python 2 only; keys() works
        # on both Python 2 and 3.
        for i in kws.keys():
            args = kws[i]['keyword_arguments']
            for arg in args:
                if '=' in arg:
                    arg_list.append(arg.split('=')[0])
                else:
                    arg_list.append(arg)
        return arg_list
def __init__(self, word2vec_, parameters): self.word2vec_model = word2vec_.model self.single_document = SimpleDocument(parameters) self.index_ = Index(parameters) self.stop_words = set(nltk.corpus.stopwords.words('english'))
def test_term(self): self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889' self.index_ = Index(self.parameters) res = self.index_.term(147) print(res, file=sys.stderr) self.assertEqual(res, 'minist')
def setUp(self): self.parameters = Parameters() self.parameters.params["repo_dir"] = '../index/test_files/index' self.index_ = Index(self.parameters)
def __init__(self, parameters): self.parameters = parameters self.enchant_dict = enchant.Dict("en_US") self.stopwords = stopwords.words('english') self.index_ = Index(self.parameters)
import sys from datetime import datetime from index.document import IndexDocument from index.index import Index from server.server import fields from index.utils import kendal_tau from metrics.utils import avg_sd index = Index() index.load() def validate_queries(queries): for i, query in enumerate(queries): if not query[0] in fields: print('{} is not a valid field (in query #{}). Try one of: {}'.format(query[0], i+1, str(fields))) sys.exit(1) def ranking_correlation(queries): validate_queries(queries) for query in queries: ranking_tfidf = index.get_documents_for_query(query[0], query[1], query[2], True) ranking_raw = index.get_documents_for_query(query[0], query[1], query[2], False) correlation = kendal_tau(ranking_tfidf, ranking_raw) print('For query [{}] on field [{}]: {} Kendal Tau correlation. Rankings with {} documents.'.format(query[1], query[0], correlation, query[2])) def query_response_time(queries): validate_queries(queries)
def test_obtain_terms_of_a_document(self):
    # Golden test: obtain_terms_of_a_document(1) on the ap8889 Indri index
    # must return (external document id, tuple of processed terms).  The
    # terms appear stemmed, and the '' entries presumably mark tokens the
    # indexer dropped (stopwords/punctuation) -- TODO confirm against the
    # Index implementation.
    self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
    self.index_ = Index(self.parameters)
    res = self.index_.obtain_terms_of_a_document(1)
    print(res, file=sys.stderr)
    self.assertEqual(res, ('AP881107-0001', (
        'minist', 'resign', '', 'gree', 'american', 'banker', 'escap', 'public', 'order', 'minist', 'tasso', 'sehioti', 'resign', 'mondai', '', '', 'greek', 'american', 'banker', 'indict', '', '', '30', 'million', 'financi', 'scandal', 'fled', '', 'countri', 'appar', 'aboard', '', 'yacht', '', 'conserv', 'opposit', 'immedi', 'demand', '', 'resign', '', 'premier', 'andrea', 'papandr', 'socialist', 'govern', 'claim', '', '', 'stage', '', 'cover', '', 'banker', 'georg', 'koskota', '34', 'disappear', 'saturdai', 'afternoon', '', 'polic', 'offic', 'speak', '', 'condit', '', 'anonym', '', 'koskota', 'fled', 'abroad', '', 'sundai', 'appar', '', 'yacht', '', '', 'seasid', 'villag', '', 'megalo', 'pefko', '20', 'mile', '', 'athen', 'sehioti', '', '', 'warrant', '', '', 'issu', '', 'koskota', 'arrest', '', '', 'ago', 'koskota', '', 'ban', '', 'leav', 'greec', 'pend', '', 'outcom', '', '', 'offici', 'enquiri', '', 'alleg', 'financi', 'irregular', '', '', 'bank', '', 'crete', '', '', 'control', 'sehioti', '', 'ministri', '', 'respons', '', 'polic', 'surveil', '', 'koskota', '', '', '', 'resign', '', '', 'public', 'order', 'ministri', 'omiss', 'creat', '', 'issu', '', 'polit', 'sensit', '', '', 'scandal', '', 'shaken', '', 'govern', '', '', 'accus', '', 'greek', 'newspap', '', 'senior', 'socialist', 'offici', '', 'involv', '', 'illeg', 'deal', 'set', '', '', '', 'bank', '', 'crete', '', 'socialist', '', '', '', 'critic', '', 'permit', 'koskota', '', 'build', '', 'multi', 'million', 'dollar', 'bank', '', 'media', 'empir', '', 'greec', '', '1984', '', 'adequ', 'check', '', '', 'central', 'bank', '', '', 'financi', 'background', '', 'govern', '', '', 'pledg', 'absolut', 'clariti', '', 'uncov', '', 'scandal', '', 'warn', '',
        '', '', '', 'pardon', '', 'member', '', '', 'rule', 'panhellen', 'socialist', 'movement', 'pasok', '', '', '', 'implic', '', 'greek', 'peopl', '', 'left', '', '', 'convict', '', 'georg', 'koskota', '', 'spirit', 'awai', '', '', '', '', '', 'speak', '', 'respons', 'goe', '', '', 'wai', '', '', 'top', '', '', 'govern', 'pyramid', '', 'constantin', 'mitsotaki', 'leader', '', '', 'new', 'democraci', 'main', 'opposit', 'parti', 'parti', '', '', '', 'statement', 'demand', '', 'govern', 'resign', 'koskota', '', 'suspend', 'oct', '20', '', 'chairman', '', '', 'bank', '', 'crete', '', 'indict', '', 'five', 'count', '', 'forgeri', '', 'embezzl', '', '', 'koskota', 'appear', '', '', 'district', 'attornei', '', '', 'charg', '', 'forg', 'document', 'purport', '', 'show', '', '', 'bank', '', 'crete', '', '13', 'million', 'invest', '', '', 'american', 'brokerag', 'firm', 'merril', 'lynch', '', '', '', 'detain', '', 'given', '', 'nov', '14', '', 'prepar', '', 'defens', 'koskota', '', '', 'accus', '', 'forg', 'document', 'purport', '', 'show', '', 'bank', '', '', '17', 'million', '', '', 'account', '', '', 'american', 'bank', 'irv', 'trust', 'corp', '', '', 'firm', '', '', '', '', '', 'record', '', '', 'deposit', 'koskota', '', 'hold', '', 'american', '', 'greek', 'citizenship', 'bought', '', 'control', 'interest', '', '', 'bank', '', 'crete', '', '1984', '', 'work', '', '', 'central', 'athen', 'branch', '', 'six', 'year', '', '', 'account', 'rival', 'newspap', '', 'claim', 'koskota', 'illeg', '', 'bank', '', 'crete', 'monei', '', 'fund', '', 'publish', 'group', 'grammi', '', 'control', 'three', 'daili', 'newspap', 'five', 'magazin', '', '', 'radio', 'station', 'koskota', 'resign', 'oct', '29', '', 'chairman', '', 'grammi', '', '', '', '', 'premier', 'son', 'educ', 'minist', 'georg', 'papandr', 'denounc', '', '', 'forgeri', '', 'bank', '', 'crete', 'statement', 'show', '', '2', '3', 'million', 'transfer', '', '', 'merril', 'lynch', 'account', '', '', 'name', '', 'younger',
        'papandr', 'show', 'report', '', 'letter', '', '', 'new', 'york', 'lawyer', 'sai', '', '', '', 'record', '', 'merril', 'lynch', '', '', '', 'transfer', 'koskota', 'parent', 'brother', 'wife', '', 'five', 'children', '', '', 'left', 'greec', '', '', 'past', '')))
def __init__(self, parameters):
    """Bind an Index over *parameters* and cache the English stopword set."""
    self.stop_words = set(nltk.corpus.stopwords.words('english'))
    self.index_ = Index(parameters)
class Neighborhood:
    """Finds clusters ("neighborhoods") of word2vec-similar words in a document
    and ranks them by their mean tf-idf weight in that document."""

    def __init__(self, word2vec_, parameters):
        """``word2vec_`` must expose a trained gensim model via ``.model``;
        ``parameters`` configures the SimpleDocument and Index helpers."""
        self.word2vec_model = word2vec_.model
        self.single_document = SimpleDocument(parameters)
        self.index_ = Index(parameters)
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def find_nearest_neighbor_in_a_list(self, unigram, other_unigrams,
                                        min_distance, neighbor_size):
        """Collect words from ``other_unigrams`` whose similarity to
        ``unigram`` exceeds ``min_distance``.

        Returns an empty list when ``unigram`` is out of vocabulary.
        NOTE(review): the loop breaks only once the list already *exceeds*
        ``neighbor_size``, so up to ``neighbor_size + 1`` entries can be
        collected, and callers keep only lists of exactly ``neighbor_size``
        -- confirm whether ``>=`` was intended.
        """
        neighbor = []
        if unigram in self.word2vec_model.wv.vocab:
            for other_unigram in other_unigrams:
                if len(neighbor) > neighbor_size:
                    break
                # Fixed: compare by value, not identity.  `is not` on strings
                # depends on CPython interning and is not a reliable inequality
                # test; `!=` expresses the intended "skip the word itself".
                if other_unigram != unigram and \
                        other_unigram not in neighbor and \
                        other_unigram in self.word2vec_model.wv.vocab:
                    sim = self.word2vec_model.similarity(
                        unigram, other_unigram)
                    if sim > min_distance:
                        neighbor += [other_unigram]
        return neighbor

    def find_significant_neighbors(self, doc_words, min_distance,
                                   neighbor_size):
        """Return the deduplicated neighbor lists of exactly
        ``neighbor_size`` words found around each in-vocabulary word."""
        significant_neighbors = []
        for other_unigram in doc_words:
            if other_unigram in self.word2vec_model.wv.vocab:
                neighbor = self.find_nearest_neighbor_in_a_list(
                    other_unigram, doc_words, min_distance, neighbor_size)
                if len(neighbor) == neighbor_size:
                    significant_neighbors += [neighbor]
        # Deduplicate identical neighbor lists (order within a list matters).
        significant_neighbors = [
            list(x) for x in set(tuple(x) for x in significant_neighbors)
        ]
        return significant_neighbors

    @staticmethod
    def merge_close_neighbors(neighbors, minimum_merge_intersection):
        """Union neighbor lists that share at least
        ``minimum_merge_intersection`` words; returns a list of sets.

        NOTE: mutates ``neighbors`` in place (merged entries are deleted).
        """
        merged_neighbors = []
        i = 0
        while i < len(neighbors):
            merged_neighbors += [set(neighbors[i])]
            j = i + 1
            while j < len(neighbors):
                neighbor_intersection = merged_neighbors[i].intersection(
                    neighbors[j])
                if len(neighbor_intersection) >= minimum_merge_intersection:
                    merged_neighbors[i] = set(merged_neighbors[i]).union(
                        neighbors[j])
                    del neighbors[j]
                else:
                    j += 1
            i += 1
        return merged_neighbors

    def find_significant_merged_neighbors(self, doc_words, min_distance,
                                          neighbor_size,
                                          minimum_merge_intersection):
        """Find significant neighbors, then merge overlapping ones."""
        significant_neighbors = self.find_significant_neighbors(
            doc_words, min_distance, neighbor_size)
        significant_merged_neighbors = self.merge_close_neighbors(
            significant_neighbors, minimum_merge_intersection)
        return significant_merged_neighbors

    def remove_stopwords_neighbors(self, neighbors, max_stop_words):
        """Drop neighborhoods with >= ``max_stop_words`` stopwords; strip
        stopwords from the surviving ones.  Mutates and returns ``neighbors``."""
        i = 0
        while i < len(neighbors):
            neighbor_stop_words_intersection = set(neighbors[i]).intersection(
                set(self.stop_words))
            if len(neighbor_stop_words_intersection) >= max_stop_words:
                del neighbors[i]
            else:
                # Iterate over a copy: we remove from the set while scanning.
                for a in neighbors[i].copy():
                    if a in self.stop_words:
                        neighbors[i].remove(a)
                i += 1
        return neighbors

    def remove_stemmed_similar_words_neighbors(self, neighbors):
        """Within each neighborhood, keep only one word per stem."""
        for k in range(len(neighbors)):
            neighbor_ = list(neighbors[k])
            neighbors[k] = set(
                self.remove_stemmed_similar_words_list(neighbor_))
        return neighbors

    def remove_stemmed_similar_words_list(self, l):
        """Remove later words that share a stem with an earlier word."""
        i = 0
        while i < len(l):
            j = i + 1
            while j < len(l):
                if self.index_.check_if_have_same_stem(l[i], l[j]):
                    del l[j]
                else:
                    j += 1
            i += 1
        return l

    def find_significant_pruned_neighbors(self, doc_words, min_distance,
                                          neighbor_size,
                                          minimum_merge_intersection,
                                          max_stop_words):
        """Full pipeline on a word list: find, merge, de-stopword, de-stem."""
        doc_words = list(set(doc_words))
        significant_neighbors = \
            self.find_significant_merged_neighbors(doc_words, min_distance,
                                                   neighbor_size,
                                                   minimum_merge_intersection)
        significant_neighbors = self.remove_stopwords_neighbors(
            significant_neighbors, max_stop_words)
        significant_neighbors = self.remove_stemmed_similar_words_neighbors(
            significant_neighbors)
        return significant_neighbors

    def find_significant_pruned_neighbors_in_doc(self, doc_file_name,
                                                 min_distance, neighbor_size,
                                                 minimum_merge_intersection,
                                                 max_stop_words):
        """As above, but starting from a document file on disk."""
        doc_words = self.single_document.get_words(doc_file_name)
        significant_neighbors = self.find_significant_pruned_neighbors(
            doc_words, min_distance, neighbor_size,
            minimum_merge_intersection, max_stop_words)
        return significant_neighbors

    def find_significant_neighbors_weight(self, doc_words,
                                          significant_neighbors_ind):
        """Weight each indexed neighborhood by the mean tf-idf of its terms."""
        significant_neighbors_weight = dict()
        for ind, neighbor in list(significant_neighbors_ind.items()):
            significant_neighbors_weight[ind] = np.mean(
                [self.index_.tfidf(term, doc_words) for term in neighbor])
        return significant_neighbors_weight

    @staticmethod
    def sort_significant_neighbors(significant_neighbors_weight,
                                   significant_neighbors_ind):
        """Return (neighborhood, weight) pairs sorted by weight, descending."""
        sorted_w = sorted(significant_neighbors_weight.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        return [(significant_neighbors_ind[k], v) for (k, v) in sorted_w]

    @staticmethod
    def index_neighbors(neighbors):
        """Assign a stable integer id to each neighborhood."""
        return {ind: neighbor for ind, neighbor in enumerate(neighbors)}

    def run(self, doc_file_name, min_distance, neighbor_size,
            minimum_merge_intersection, max_stop_words):
        """End-to-end: read a document, build pruned neighborhoods, weight
        them by tf-idf and return them sorted by weight (descending)."""
        doc_words = self.single_document.get_words(doc_file_name)
        print("doc_words length =", len(doc_words))
        significant_neighbors = self.find_significant_pruned_neighbors(
            doc_words, min_distance, neighbor_size,
            minimum_merge_intersection, max_stop_words)
        print("significant_neighbors length =", len(significant_neighbors))
        significant_neighbors_ind = self.index_neighbors(significant_neighbors)
        print("significant_neighbors_ind length =",
              len(significant_neighbors_ind))
        significant_neighbors_weight = self.find_significant_neighbors_weight(
            doc_words, significant_neighbors_ind)
        print("significant_neighbors_weight length =",
              len(significant_neighbors_weight))
        sorted_significant_neighbors = self.sort_significant_neighbors(
            significant_neighbors_weight, significant_neighbors_ind)
        print("sorted_significant_neighbors length =",
              len(sorted_significant_neighbors))
        return sorted_significant_neighbors