コード例 #1
0
    def test_idf(self):
        """Check ``idf`` on the fixture index, then query a second repository.

        NOTE(review): the final assertion checks ``document_count`` rather
        than ``idf`` -- confirm that this is intentional.
        """
        dog_idf = self.index_.idf("dog")
        self.assertEqual(dog_idf, 1.0986122886681098)

        # Re-point the shared parameters at the other repository and rebuild
        # the index object on top of it.
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        first_count = self.index_.document_count("first")
        self.assertEqual(first_count, 0)
コード例 #2
0
    def test_document_count(self):
        """Check ``document_count`` on the fixture index and on a second repository."""
        dog_documents = self.index_.document_count("dog")
        self.assertEqual(dog_documents, 1)

        # Re-point the shared parameters at the other repository and rebuild
        # the index object on top of it.
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_documents = self.index_.document_count("emotional")
        self.assertEqual(emotional_documents, 2973)
コード例 #3
0
 def test_index_with_xml_libraries(self):
     """Scan the suite with the XML library directory and inspect the index.

     Expects exactly one file in ``self.index_dir`` whose JSON ``keywords``
     list contains a SwingLibrary entry and two named keywords.
     """
     library_dir = os.path.join(env.RESOURCES_DIR, 'library')
     xml_db_dir = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
     Scanner(library_dir).scan(self.suite_dir, 'robot', xml_db_dir)

     indexer = Index(xml_db_dir, self.index_dir, self.xml_libs)
     indexer.index_consturctor(self.resource_a_table_name)

     index_files = os.listdir(self.index_dir)
     self.assertEqual(len(index_files), 1)
     index_file = os.path.join(self.index_dir, index_files[0])
     with open(index_file) as handle:
         data = json.load(handle)

     # Third element of a keyword entry is compared against the library name.
     self.assertTrue(
         any(entry[2] == 'SwingLibrary' for entry in data['keywords']))
     # First element of a keyword entry is compared against the keyword name.
     for expected_name in ('Add Table Cell Selection',
                           'Select From Popup Menu'):
         self.assertTrue(
             any(entry[0] == expected_name for entry in data['keywords']))
コード例 #4
0
    def test_term_count(self):
        """Check ``term_count`` on the fixture index and on a second repository."""
        dog_occurrences = self.index_.term_count("dog")
        self.assertEqual(dog_occurrences, 2)

        # Re-point the shared parameters at the other repository and rebuild
        # the index object on top of it.
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_occurrences = self.index_.term_count("emotional")
        self.assertEqual(emotional_occurrences, 3515)
コード例 #5
0
ファイル: hide1.py プロジェクト: loinly/TextStego
 def info(self, fi='', pagenum=100):
     """Run the crawl / preprocess / index / search pipeline for one file.

     Args:
         fi: path handed to ``FileUtil.readfile`` and to ``Search1``.
         pagenum: value forwarded to ``SpiderLink.crawl``.

     Returns:
         Tuple ``(keywords, num)``: the result of ``PreDeal.seg`` on the file
         content and whatever ``Search1.retrieve`` returns.
     """
     text = FileUtil.readfile(fi)
     keywords = PreDeal.seg(text)

     # 1. Keyword extraction
     keys = jieba.analyse.textrank(
         text, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))

     # 2. Use the search engine to crawl related web pages
     # 2.1 Fetch the links
     SpiderLink(keys, self.root).crawl(pagenum)
     # 2.2 Fetch the page content
     joined_keys = '_'.join(keys)  # shared stem for every per-run path below
     SpiderTo(joined_keys + '.html').crawl()

     # 3. Text preprocessing: de-duplicate, drop stop words, segment,
     #    keep the URL and keyword sets
     filepath = os.path.join(config.spidertext, joined_keys)
     prepath = os.path.join(config.prepapath, joined_keys)
     PreDeal().savetexts(filepath=filepath, prepath=prepath)

     # 4. Build the index and search it for pages containing the keywords
     # 4.1 Index construction
     indexpath = os.path.join(config.indexpath, joined_keys)
     Index().build(datapath=prepath, indexpath=indexpath)
     searcher = Search1(filename=fi, pindexp=indexpath)
     # 4.2 Search and save (a copy is passed so the returned list is untouched)
     num = searcher.retrieve(keywords=keywords[:])
     return keywords, num
コード例 #6
0
ファイル: gensim_corpus.py プロジェクト: wsgan001/SWDM
 def __init__(self, parameters):
     """Prepare tokenisation helpers and the index, then initialise the parent.

     Args:
         parameters: project parameters object; its ``params["lda"]["file_name"]``
             entry is passed to the parent initialiser.
     """
     english_stop_words = nltk.corpus.stopwords.words('english')
     self.stop_words = set(english_stop_words)
     self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
     self.parameters = parameters
     self.index_ = Index(self.parameters)
     # Runs before the parent initialiser is handed the file name below.
     self.store_collection_if_not_exists()
     lda_settings = self.parameters.params["lda"]
     super().__init__(lda_settings["file_name"])
コード例 #7
0
    def test_check_if_exists_in_index(self):
        """Check term membership lookups against the repository under /scratch."""
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        in_index = self.index_.check_if_exists_in_index
        self.assertTrue(in_index("emotional"))
        for missing_term in ("first", "included"):
            self.assertFalse(in_index(missing_term))
        self.assertTrue(in_index("includes"))
コード例 #8
0
ファイル: run_index.py プロジェクト: samkong110/Data
def index_single(db_path, db_table, index_path, module_search_path,
                 libs_in_xml):
    """Build the index for a single database table.

    Args:
        db_path: forwarded to ``Index`` as ``db_path``.
        db_table: table passed to ``Index.index_consturctor``.
        index_path: output directory; created when it does not exist.
        module_search_path: iterable of paths appended to ``sys.path``.
        libs_in_xml: forwarded to ``Index`` as ``xml_libraries``.
    """
    sys.path.extend(module_search_path)
    index_dir_missing = not path.exists(index_path)
    if index_dir_missing:
        makedirs(index_path)
    builder = Index(
        db_path=db_path, index_path=index_path, xml_libraries=libs_in_xml)
    builder.index_consturctor(table=db_table)
コード例 #9
0
    def test_tf(self):
        """Check ``tf`` for one document's words on two different indexes."""
        document = SimpleDocument(self.parameters)
        words = document.get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")

        dog_tf = self.index_.tf("dog", words)
        self.assertEqual(dog_tf, 0.5)

        # Re-point the shared parameters at the other repository and rebuild
        # the index object on top of it.
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_tf = self.index_.tf("emotional", words)
        self.assertEqual(emotional_tf, 0.5)
コード例 #10
0
 def setUp(self):
     """Recreate an empty index directory and build the ``Index`` under test."""
     target_dir = os.path.join(env.RESULTS_DIR, 'index_dir')
     self.index_dir = target_dir
     # Keep removing until the directory is really gone, pausing briefly
     # between attempts.
     while os.path.exists(target_dir):
         shutil.rmtree(target_dir)
         sleep(0.1)
     os.makedirs(target_dir)
     self.index = Index(self.db_dir, target_dir)
コード例 #11
0
    def test_tfidf(self):
        """Check ``tfidf`` for an indexed term and the error for a missing one."""
        settings = self.parameters.params
        settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        document = SimpleDocument(self.parameters)
        words = document.get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")
        emotional_score = self.index_.tfidf('emotional', words)
        print(emotional_score, file=sys.stderr)
        self.assertEqual(emotional_score, 2.0455597255490345)

        # A term missing from the index must raise with an explanatory message.
        expected_fragment = (
            'unigram "is" not exist. Probably was a stopword in indexing.')
        with self.assertRaises(Exception) as raised:
            self.index_.tfidf('is', words)
        message = str(raised.exception)
        self.assertTrue(expected_fragment in message)
コード例 #12
0
 def test_obtain_term_ids_of_a_document(self):
     """Check the term ids returned for document 1 of the /scratch repository.

     The expected value is a 2-tuple: a string followed by a tuple of ints.
     NOTE(review): the string looks like the external document id and the
     0 entries presumably mark positions whose term is not in the index
     vocabulary -- confirm against ``Index``.
     """
     # Re-point the shared parameters at the repository and rebuild the index.
     self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)
     res = self.index_.obtain_term_ids_of_a_document(1)
     # Exact, position-by-position comparison against the recorded ids.
     self.assertEqual(
         res,
         ('AP881107-0001',
          (147, 771, 0, 78064, 26, 2828, 1283, 92, 126, 147, 175009, 159395,
           771, 55, 0, 0, 2362, 26, 2828, 919, 0, 0, 115, 8, 461, 1624,
           1826, 0, 35, 693, 1198, 0, 195412, 0, 724, 430, 621, 340, 0, 771,
           0, 1502, 20649, 4327, 1620, 9, 247, 0, 0, 866, 0, 643, 0, 2828,
           415, 101374, 1289, 2015, 276, 1246, 0, 24, 29, 586, 0, 272, 0,
           856, 0, 101374, 1826, 2153, 0, 174, 693, 0, 195412, 0, 0, 158999,
           1037, 0, 117013, 137162, 123, 157, 0, 4415, 159395, 0, 0, 2262,
           0, 0, 56, 0, 101374, 251, 0, 0, 189, 101374, 0, 569, 0, 332,
           3095, 1873, 0, 2974, 0, 0, 13, 63630, 0, 485, 461, 91464, 0, 0,
           91, 0, 50405, 0, 0, 156, 159395, 0, 690, 0, 347, 0, 24, 4049, 0,
           101374, 0, 0, 0, 771, 0, 0, 92, 126, 690, 131907, 609, 0, 56, 0,
           88, 2222, 0, 0, 1624, 0, 160974, 0, 9, 0, 0, 436, 0, 2362, 273,
           0, 774, 1620, 13, 0, 263, 0, 887, 339, 176, 0, 0, 0, 91, 0,
           50405, 0, 1620, 0, 0, 0, 289, 0, 1202, 101374, 0, 254, 0, 4543,
           8, 193, 91, 0, 979, 3597, 0, 3095, 0, 791, 0, 2768, 937, 0, 0,
           264, 91, 0, 0, 461, 2464, 0, 9, 0, 0, 1333, 2198, 45622, 0, 4433,
           0, 1624, 0, 678, 0, 0, 0, 0, 3790, 0, 40, 0, 0, 118, 135313,
           1620, 727, 136295, 0, 0, 0, 3408, 0, 2362, 6, 0, 245, 0, 0, 494,
           0, 415, 101374, 0, 2042, 435, 0, 0, 0, 0, 0, 586, 0, 347, 1461,
           0, 0, 116, 0, 0, 354, 0, 0, 9, 145130, 0, 48310, 120515, 51, 0,
           0, 1, 956, 540, 430, 32, 32, 0, 0, 0, 221, 340, 0, 9, 771,
           101374, 0, 1431, 897, 123, 0, 248, 0, 0, 91, 0, 50405, 0, 919, 0,
           114, 667, 0, 70387, 0, 62918, 0, 0, 101374, 261, 0, 0, 311, 212,
           0, 0, 61, 0, 4322, 767, 144897, 0, 62, 0, 0, 91, 0, 50405, 0,
           375, 8, 546, 0, 0, 26, 3527, 543, 3657, 3224, 0, 0, 0, 2223, 0,
           533, 0, 1019, 367, 0, 674, 0, 165, 101374, 0, 0, 436, 0, 4322,
           767, 144897, 0, 62, 0, 91, 0, 0, 445, 8, 0, 0, 550, 0, 0, 26, 91,
           3533, 1512, 285, 0, 0, 543, 0, 0, 0, 0, 0, 192, 0, 0, 1798,
           101374, 0, 328, 0, 26, 0, 2362, 45369, 1390, 0, 156, 167, 0, 0,
           91, 0, 50405, 0, 791, 0, 28, 0, 0, 264, 4415, 1977, 0, 187, 15,
           0, 0, 550, 1483, 273, 0, 247, 101374, 887, 0, 91, 0, 50405, 168,
           0, 303, 0, 515, 34, 77704, 0, 156, 23, 701, 273, 114, 1056, 0, 0,
           409, 490, 101374, 771, 897, 899, 0, 248, 0, 77704, 0, 0, 0, 0,
           1502, 599, 463, 147, 415, 4327, 2512, 0, 0, 70387, 0, 91, 0,
           50405, 221, 62, 0, 17, 41, 8, 1548, 0, 0, 3657, 3224, 550, 0, 0,
           171, 0, 2597, 4327, 62, 4, 0, 512, 0, 0, 1, 73, 532, 7, 0, 0, 0,
           192, 0, 3657, 3224, 0, 0, 0, 1548, 101374, 626, 918, 455, 0, 114,
           219, 0, 0, 245, 3095, 0, 0, 333, 0)))
コード例 #13
0
    def test_obtain_text_of_a_document(self):
        """Check the raw text returned for document 1 of the /scratch repository.

        The comparison is exact, including the leading newline and the
        original line breaks and indentation of the expected text.
        """
        # Re-point the shared parameters at the repository and rebuild the index.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_text_of_a_document(1)
        # The literal below is runtime data; its whitespace is significant.
        self.assertEqual(
            res, """
   Public Order Minister Tassos Sehiotis
resigned Monday after a Greek-American banker indicted in a $30
million financial scandal fled the country, apparently aboard a
yacht.
   The conservative opposition immediately demanded the resignation
of Premier Andreas Papandreou's socialist government, claiming it
was staging a cover-up.
   Banker George Koskotas, 34, disappeared Saturday afternoon. A
police officer, speaking on condition of anonymity, said Koskotas
fled abroad on Sunday, apparently by yacht from the seaside village
of Megalo Pefko, 20 miles from Athens.
   Sehiotis said a warrant had been issued for Koskotas' arrest. One
week ago, Koskotas was banned from leaving Greece pending the
outcome of an official enquiry into alleged financial irregularities
at the Bank of Crete, which he controls.
   Sehiotis, whose ministry was responsible for police surveillance
of Koskotas, said he was resigning ``since such (public order
ministry) omissions ... create an issue of political sensitivity.''
   The scandal has shaken the government because of accusations in
Greek newspapers that senior socialist officials were involved in
illegal deals set up by the Bank of Crete.
   The socialists also have been criticized for permitting Koskotas
to build a multi-million dollar banking and media empire in Greece
since 1984 without adequate checks by the central bank on his
financial background.
   The government last week pledged ``absolute clarity'' in
uncovering the scandal and warned there will be ``no pardons' for
members of the ruling Panhellenic Socialist Movement (PASOK) who may
be implicated.
   ``The Greek people are left with the conviction that George
Koskotas was spirited away so that he would not speak. The
responsibility goes all the way to the top of the government
pyramid,'' Constantine Mitsotakis, leader of the New Democracy main
opposition party party, said in a statement demanding the government
resign.
   Koskotas was suspended Oct. 20 as chairman of the Bank of Crete
and indicted on five counts of forgery and embezzlement.
   Last week Koskotas appeared before a district attorney on a
charge of forging documents purporting to show that the Bank of
Crete had $13 million invested with the American brokerage firm
Merrill Lynch.
   He was not detained but given until Nov. 14 to prepare his
defense.
   Koskotas also is accused of forging documents purporting to show
his bank had another $17 million in an account with an American
bank, Irving Trust Corp. Both U.S. firms have said they had no
record of the deposits.
   Koskotas, who holds both American and Greek citizenship, bought a
controlling interest in the Bank of Crete in 1984 after working in
its central Athens branch for six years as an accountant.
   Rival newspapers have claimed Koskotas illegally used Bank of
Crete money to fund his publishing group Grammi, which controls
three daily newspapers, five magazines and a radio station.
   Koskotas resigned Oct. 29 as chairman of Grammi, the day after
the premier's son, Education Minister George Papandreou, denounced
as a forgery a Bank of Crete statement showing a $2.3 million
transfer to a Merrill Lynch account in his name.
   The younger Papandreou showed reporters a letter from a New York
lawyer saying there was no record at Merrill Lynch of such a
transfer.
   Koskotas' parents, brother, wife and five children all have left
Greece during the past week.
""")
コード例 #14
0
ファイル: neighborhood.py プロジェクト: wsgan001/SWDM
 def __init__(self, word2vec_, parameters):
     """Keep the word2vec model and build the document, index and stop-word helpers.

     Args:
         word2vec_: object whose ``model`` attribute is stored on the instance.
         parameters: project parameters passed to ``SimpleDocument`` and ``Index``.
     """
     model = word2vec_.model
     self.word2vec_model = model
     self.single_document = SimpleDocument(parameters)
     self.index_ = Index(parameters)
     english_stop_words = nltk.corpus.stopwords.words('english')
     self.stop_words = set(english_stop_words)
コード例 #15
0
 def __init__(self, parameters):
     """Store the parameters and build the dictionary, stop-word list and index.

     Args:
         parameters: project parameters object, also passed to ``Index``.
     """
     self.parameters = parameters
     us_english = enchant.Dict("en_US")
     self.enchant_dict = us_english
     english_stop_words = stopwords.words('english')
     self.stopwords = english_stop_words
     self.index_ = Index(self.parameters)
コード例 #16
0
import sys
from datetime import datetime

from index.document import IndexDocument
from index.index import Index
from server.server import fields
from index.utils import kendal_tau
from metrics.utils import avg_sd

# Module-level Index instance; created and loaded once when this module is
# imported, and used by ranking_correlation().
index = Index()
index.load()

def validate_queries(queries):
    """Exit with status 1 if any query names a field that is not in ``fields``.

    Args:
        queries: sequence of queries; only element 0 (the field) is checked.
    """
    for position, query in enumerate(queries, start=1):
        field = query[0]
        if field in fields:
            continue
        # Report the 1-based position of the offending query before exiting.
        print('{} is not a valid field (in query #{}). Try one of: {}'.format(field, position, str(fields)))
        sys.exit(1)

def ranking_correlation(queries):
    """Print the ``kendal_tau`` value between two rankings of each query.

    Each query is run twice through ``index.get_documents_for_query``, with
    the last argument ``True`` and then ``False``.
    NOTE(review): that flag presumably toggles TF-IDF weighting -- confirm
    against ``Index``.

    Args:
        queries: sequence of queries indexed as (field, text, document count).
    """
    validate_queries(queries)

    report = 'For query [{}] on field [{}]: {} Kendal Tau correlation. Rankings with {} documents.'
    for query in queries:
        field, text, size = query[0], query[1], query[2]
        weighted = index.get_documents_for_query(field, text, size, True)
        unweighted = index.get_documents_for_query(field, text, size, False)

        tau = kendal_tau(weighted, unweighted)
        print(report.format(text, field, tau, size))


def query_response_time(queries):
    # NOTE(review): only the validation step is visible in this excerpt; any
    # timing logic implied by the name is not shown here -- confirm against
    # the full file.
    validate_queries(queries)
コード例 #17
0
 def __init__(self, parameters):
     """Build the index and the English stop-word set.

     Args:
         parameters: project parameters object passed to ``Index``.
     """
     self.index_ = Index(parameters)
     english_stop_words = nltk.corpus.stopwords.words('english')
     self.stop_words = set(english_stop_words)
コード例 #18
0
    def setUp(self):
        """Create parameters pointing at the test index files and build the index."""
        fixture_parameters = Parameters()
        self.parameters = fixture_parameters
        fixture_parameters.params["repo_dir"] = '../index/test_files/index'

        self.index_ = Index(self.parameters)
コード例 #19
0
 def test_obtain_terms_of_a_document(self):
     """Check the terms returned for document 1 of the /scratch repository.

     The expected value is a 2-tuple: a string followed by a tuple of strings.
     NOTE(review): the terms look stemmed and the empty strings presumably
     mark positions whose word is not in the index vocabulary -- confirm
     against ``Index``.
     """
     # Re-point the shared parameters at the repository and rebuild the index.
     self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)
     res = self.index_.obtain_terms_of_a_document(1)
     # Echo the actual value to stderr.
     print(res, file=sys.stderr)
     # Exact, position-by-position comparison against the recorded terms.
     self.assertEqual(res, ('AP881107-0001', (
         'minist', 'resign', '', 'gree', 'american', 'banker', 'escap',
         'public', 'order', 'minist', 'tasso', 'sehioti', 'resign',
         'mondai', '', '', 'greek', 'american', 'banker', 'indict', '', '',
         '30', 'million', 'financi', 'scandal',
         'fled', '', 'countri', 'appar', 'aboard', '', 'yacht', '',
         'conserv', 'opposit', 'immedi', 'demand', '', 'resign', '',
         'premier', 'andrea', 'papandr', 'socialist', 'govern', 'claim', '',
         '', 'stage', '', 'cover', '', 'banker', 'georg', 'koskota', '34',
         'disappear', 'saturdai', 'afternoon', '', 'polic', 'offic',
         'speak', '', 'condit', '', 'anonym', '', 'koskota', 'fled',
         'abroad', '', 'sundai', 'appar', '', 'yacht', '', '', 'seasid',
         'villag', '', 'megalo', 'pefko', '20', 'mile', '', 'athen',
         'sehioti', '', '', 'warrant', '', '', 'issu', '', 'koskota',
         'arrest', '', '', 'ago', 'koskota', '', 'ban', '', 'leav', 'greec',
         'pend', '', 'outcom', '', '', 'offici', 'enquiri', '', 'alleg',
         'financi', 'irregular', '', '', 'bank', '', 'crete', '', '',
         'control', 'sehioti', '', 'ministri', '', 'respons', '', 'polic',
         'surveil', '', 'koskota', '', '', '', 'resign', '', '', 'public',
         'order', 'ministri', 'omiss', 'creat', '', 'issu', '', 'polit',
         'sensit', '', '', 'scandal', '', 'shaken', '', 'govern', '', '',
         'accus', '', 'greek', 'newspap', '', 'senior', 'socialist',
         'offici', '', 'involv', '', 'illeg', 'deal', 'set', '', '', '',
         'bank', '', 'crete', '', 'socialist', '', '', '', 'critic', '',
         'permit', 'koskota', '', 'build', '', 'multi', 'million', 'dollar',
         'bank', '', 'media', 'empir', '', 'greec', '', '1984', '', 'adequ',
         'check', '', '', 'central', 'bank', '', '', 'financi',
         'background', '', 'govern', '', '', 'pledg', 'absolut', 'clariti',
         '', 'uncov', '', 'scandal', '', 'warn', '', '', '', '', 'pardon',
         '', 'member', '', '', 'rule', 'panhellen', 'socialist', 'movement',
         'pasok', '', '', '', 'implic', '', 'greek', 'peopl', '', 'left',
         '', '', 'convict', '', 'georg', 'koskota', '', 'spirit', 'awai',
         '', '', '', '', '', 'speak', '', 'respons', 'goe', '', '', 'wai',
         '', '', 'top', '', '', 'govern', 'pyramid', '', 'constantin',
         'mitsotaki', 'leader', '', '', 'new', 'democraci', 'main',
         'opposit', 'parti', 'parti', '', '', '', 'statement', 'demand', '',
         'govern', 'resign', 'koskota', '', 'suspend', 'oct', '20', '',
         'chairman', '', '', 'bank', '', 'crete', '', 'indict', '', 'five',
         'count', '', 'forgeri', '', 'embezzl', '', '', 'koskota', 'appear',
         '', '', 'district', 'attornei', '', '', 'charg', '', 'forg',
         'document', 'purport', '', 'show', '', '', 'bank', '', 'crete', '',
         '13', 'million', 'invest', '', '', 'american', 'brokerag', 'firm',
         'merril', 'lynch', '', '', '', 'detain', '', 'given', '', 'nov',
         '14', '', 'prepar', '', 'defens', 'koskota', '', '', 'accus', '',
         'forg', 'document', 'purport', '', 'show', '', 'bank', '', '',
         '17', 'million', '', '', 'account', '', '', 'american', 'bank',
         'irv', 'trust', 'corp', '', '', 'firm', '', '', '', '', '',
         'record', '', '', 'deposit', 'koskota', '', 'hold', '', 'american',
         '', 'greek', 'citizenship', 'bought', '', 'control', 'interest',
         '', '', 'bank', '', 'crete', '', '1984', '', 'work', '', '',
         'central', 'athen', 'branch', '', 'six', 'year', '', '', 'account',
         'rival', 'newspap', '', 'claim', 'koskota', 'illeg', '', 'bank',
         '', 'crete', 'monei', '', 'fund', '', 'publish', 'group', 'grammi',
         '', 'control', 'three', 'daili', 'newspap', 'five', 'magazin', '',
         '', 'radio', 'station', 'koskota', 'resign', 'oct', '29', '',
         'chairman', '', 'grammi', '', '', '', '', 'premier', 'son', 'educ',
         'minist', 'georg', 'papandr', 'denounc', '', '', 'forgeri', '',
         'bank', '', 'crete', 'statement', 'show', '', '2', '3', 'million',
         'transfer', '', '', 'merril', 'lynch', 'account', '', '', 'name',
         '', 'younger', 'papandr', 'show', 'report', '', 'letter', '', '',
         'new', 'york', 'lawyer', 'sai', '', '', '', 'record', '', 'merril',
         'lynch', '', '', '', 'transfer', 'koskota', 'parent', 'brother',
         'wife', '', 'five', 'children', '', '', 'left', 'greec', '', '',
         'past', '')))
コード例 #20
0
 def test_term(self):
     """Check that term id 147 maps to 'minist' in the /scratch repository."""
     settings = self.parameters.params
     settings["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)

     term_text = self.index_.term(147)
     # Echo the actual value to stderr.
     print(term_text, file=sys.stderr)
     self.assertEqual(term_text, 'minist')