Exemple #1
0
 def test_add_file_raises_file_not_found_error(self):
     """add_file() must let a FileNotFoundError raised by open() propagate."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True), \
             mock.patch('builtins.open', mock.mock_open()) as fake_open:
         # Every call to the patched open() fails with this error.
         fake_open.side_effect = FileNotFoundError
         idx = Index()
         self.assertRaises(FileNotFoundError, idx.add_file, 'doc1')
Exemple #2
0
 def test_add_file_raises_permission_error(self):
     """add_file() must let a PermissionError raised by open() propagate."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True), \
             mock.patch('builtins.open', mock.mock_open()) as fake_open:
         # Every call to the patched open() fails with this error.
         fake_open.side_effect = PermissionError
         idx = Index()
         self.assertRaises(PermissionError, idx.add_file, 'doc1')
Exemple #3
0
 def test_add_file_raises_is_a_directory_error(self):
     """add_file() must let an IsADirectoryError raised by open() propagate."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True), \
             mock.patch('builtins.open', mock.mock_open()) as fake_open:
         # Every call to the patched open() fails with this error.
         fake_open.side_effect = IsADirectoryError
         idx = Index()
         self.assertRaises(IsADirectoryError, idx.add_file, 'doc1')
Exemple #4
0
 def test_add_file_with_empty_file_name(self):
     """add_file('') surfaces the FileNotFoundError raised by the patched open()."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         with mock.patch('builtins.open', mock.mock_open()) as fake_open:
             fake_open.side_effect = FileNotFoundError
             idx = Index()
             self.assertRaises(FileNotFoundError, idx.add_file, '')
Exemple #5
0
 def test_add_file_with_stop_words_and_stemming_enabled(self):
     """Index one file with stop-word removal and stemming both enabled.

     The expected index has no entry for 'is', and lists 'needs',
     'stemming' and ';continue' as 'need', 'stem' and 'continu'.
     """
     text = 'this is some data that needs stemming ;continue hello world'
     want = defaultdict(list, {
         'this': [1], 'some': [1], 'data': [1], 'that': [1], 'need': [1],
         'stem': [1], 'continu': [1], 'hello': [1], 'world': [1],
     })
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = True
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = True
         with mock.patch('builtins.open', mock.mock_open(read_data=text)):
             idx = Index()
             idx.add_file('doc1')
         assert idx._index == want
Exemple #6
0
 def test_add_file_with_remove_stop_word_enabled(self):
     """With stop-word removal on, only 'some', 'here' and 'before' are expected in the index."""
     text = 'at some I am here before as so me'
     want = defaultdict(list, {'some': [1], 'here': [1], 'before': [1]})
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = True
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         with mock.patch('builtins.open', mock.mock_open(read_data=text)):
             idx = Index()
             idx.add_file('doc1')
         assert idx._index == want
Exemple #7
0
 def test_add_file_with_identic_words(self):
     """A word repeated within one file is expected to yield a single posting."""
     text = 'data data data'
     want = {'data': [1]}
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         with mock.patch('builtins.open', mock.mock_open(read_data=text)):
             idx = Index()
             idx.add_file('doc1')
         assert idx._index == want
Exemple #8
0
 def test_index_structure_with_stemming_disabled(self):
     """With stemming off, words are expected to be indexed in their original form."""
     text = 'cycling continuos continue'
     want = {'cycling': [1], 'continuos': [1], 'continue': [1]}
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         with mock.patch('builtins.open', mock.mock_open(read_data=text)):
             idx = Index()
             idx.add_file('doc1')
         assert idx._index == want
Exemple #9
0
 def test_empty_query(self):
     """An empty query string against a populated index is expected to return None."""
     postings = defaultdict(list, {'data': [1, 3], 'some': [1, 2], 'hello': [1], 'world': [3]})
     documents = ['doc1', 'doc2', 'doc3', 'doc4']
     want = None
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         idx = Index()
         # Inject the index state directly instead of reading files.
         idx._index = postings
         idx._files = documents
         assert idx.get_result_for_query('') == want
Exemple #10
0
 def test_wrong_query(self):
     """A malformed query string is expected to be rejected with ValueError."""
     postings = defaultdict(list, {'data': [1, 3], 'some': [1, 2], 'hello': [1], 'world': [3]})
     documents = ['doc1', 'doc2', 'doc3', 'doc4']
     bad_query = 'data. some hello this is wrong query !'
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         idx = Index()
         # Inject the index state directly instead of reading files.
         idx._index = postings
         idx._files = documents
         self.assertRaises(ValueError, idx.get_result_for_query, bad_query)
Exemple #11
0
 def test_query_with_stemming_enabled(self):
     """With stemming on, 'continuos && cycling' is expected to match the stemmed postings.

     The index only holds the stems 'continuo' and 'cycl'; the only
     document listed under both is number 1, hence ['doc1'].
     """
     postings = defaultdict(list, {'continuo': [1, 3], 'cycl': [1, 2], 'hello': [1], 'world': [3]})
     documents = ['doc1', 'doc2', 'doc3', 'doc4']
     want = ['doc1']
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = True
         idx = Index()
         # Inject the index state directly instead of reading files.
         idx._index = postings
         idx._files = documents
         assert idx.get_result_for_query('continuos && cycling') == want
Exemple #12
0
 def test_add_file_with_wrong_type(self):
     """add_file() is expected to reject None, a list, a bool and a dict with ValueError."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         with mock.patch('builtins.open', mock.mock_open()):
             idx = Index()
             # Same order as the individual checks: None, list, bool, dict.
             for bad_name in (None, [], True, {}):
                 with self.assertRaises(ValueError):
                     idx.add_file(bad_name)
Exemple #13
0
 def test_add_file_correct_input_with_more_files(self):
     """Three distinct files are recorded in order; re-adding the last one raises IndexError."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True):
         with mock.patch('builtins.open', mock.mock_open()):
             idx = Index()
             for name in ('doc1', 'doc2', 'doc3'):
                 idx.add_file(name)
             # A file that is already registered must be refused.
             self.assertRaises(IndexError, idx.add_file, 'doc3')
         assert idx._files == ['doc1', 'doc2', 'doc3']
Exemple #14
0
    def assign_indexes(self, publications):
        """Add a 'WOS' index to publications that have a 'WOK' identifier.

        Publications that already carry a 'WOS' index, or that have no 'WOK'
        identifier, are skipped. For the rest, the value of the first 'WOK'
        identifier is passed to ``self._find_editions`` and every edition it
        returns is appended to the matching publication as
        ``Index(edition, type='WOS')``.

        Args:
            publications: iterable of objects exposing ``indexes`` and
                ``identifiers`` lists; modified in place.
        """
        pub_by_id = {}

        for pub in publications:
            # Already indexed in WOS -- nothing to do for this publication.
            if list(Index.find_by_type(pub.indexes, 'WOS')):
                continue

            wok_ids = list(Identifier.find_by_type(pub.identifiers, 'WOK'))
            if not wok_ids:
                continue

            # Only the first WOK identifier is used as the lookup key.
            pub_by_id[wok_ids[0].value] = pub

        # Pass a real list (not a keys() view) so the helper sees the same
        # type on Python 2 and Python 3.
        editions = self._find_editions(list(pub_by_id))
        # items() instead of the Python-2-only iteritems().
        for ut, edition in editions.items():
            pub_by_id[ut].indexes.append(Index(edition, type='WOS'))
Exemple #15
0
def create_index(name, desc, final_photo, projectName):
    """Create an Index card for the named project and commit it.

    The project is looked up with ``get_project_by_name(projectName)``;
    ``final_photo`` is stored as the card's ``index_url``.
    """
    owner = get_project_by_name(projectName)
    card = Index(
        project_id=owner.project_id,
        name=name,
        desc=desc,
        index_url=final_photo,
    )

    session = db.session
    session.add(card)
    session.commit()
Exemple #16
0
 def test_add_file_raises_index_error(self):
     """Adding the same file name twice is expected to raise IndexError."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True), \
             mock.patch('builtins.open', mock.mock_open()):
         idx = Index()
         idx.add_file('doc1')
         self.assertRaises(IndexError, idx.add_file, 'doc1')
Exemple #17
0
def main():
    """Create an index and fill it from 'files.txt' or from interactive input.

    Each non-blank line of 'files.txt' that does not start with '#' names a
    file in its first whitespace-separated token. If 'files.txt' is missing,
    file names are read from the prompt until an empty line is entered.
    """
    # create a new index
    try:
        index = Index()
    except ValueError as e:
        # Without an index there is nothing to add files to. Falling through
        # would leave `index` unbound and fail on the first use below.
        print(e)
        return

    try:
        with open('files.txt') as file:
            for line in file:
                entry = line.strip()
                if not entry or entry.startswith('#'):
                    continue
                add_file_to_index(entry.split()[0], index)
    except FileNotFoundError:
        print('files.txt was not found, continuing with manual file addition.')
        prompt = 'File to add to index (or simply press enter for query): '
        while (file := input(prompt)) != "":
            add_file_to_index(file, index)
Exemple #18
0
 def test_add_file_correct_index_structure_with_more_files(self):
     """Postings accumulate across files: 'data' occurs in both, so it maps to [1, 2]."""
     first_text = 'some data'
     second_text = 'data here'
     want = {'some': [1], 'data': [1, 2], 'here': [2]}
     with mock.patch('model.index.Config', autospec=True, spec_set=True) as config_cls:
         settings = config_cls.return_value
         settings.remove_stopwords.return_value = False
         settings.language.return_value = 'english'
         settings.use_stemming.return_value = False
         idx = Index()
         # Each file gets its own patched open() so it reads its own content.
         with mock.patch('builtins.open', mock.mock_open(read_data=first_text)):
             idx.add_file('document1')
         with mock.patch('builtins.open', mock.mock_open(read_data=second_text)):
             idx.add_file('document2')
         assert idx._index == want
Exemple #19
0
    def _parse_csv(self, content, encoding='UTF-8'):
        """Parse a SCOPUS CSV export and yield one Publication per row.

        ``content`` is BOM-stripped, split into lines and read with
        ``unicodecsv.DictReader``. Every row is expected to provide the
        columns read below ('Authors', 'Title', 'Year', 'Source title',
        'Volume', 'Issue', 'Page start', 'Page end', 'DOILink', 'Cited by',
        'Art. No.', 'Publisher', 'ISSN', 'ISBN').

        Args:
            content: raw CSV payload.
            encoding: encoding handed to the CSV reader.

        Yields:
            Publication objects with source URL, identifiers (SCOPUS eid,
            ISSN, ISBN, DOI) and a 'SCOPUS' index attached.
        """
        # Named `reader` rather than `csv` so the stdlib module name is not shadowed.
        reader = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                       encoding=encoding)

        def empty_to_none(s):
            """Return ``s`` stripped, or None when it is None or blank."""
            if s is None:
                return None
            s = s.strip()
            return s or None

        def list_remove_empty(items):
            """Strip every item and drop those that end up empty."""
            stripped = (empty_to_none(x) for x in items)
            return [v for v in stripped if v]

        def to_num(x):
            """Convert a numeric field to int; a blank field counts as 0."""
            x = x.strip()
            return int(x) if x else 0

        for line in reader:
            if line['Authors'] == '[No author name available]':
                authors = []
            else:
                # (mrshu): SCOPUS separates both the surname from the initials
                # and the individual authors with a comma, e.g.
                #
                # Brejová, B., Brown, D.G., Li, M., Vinař, T.
                #
                # That is ambiguous, so the comma that ends a full name (the
                # one right after a dot) is replaced with a semicolon, and the
                # author parser is told that the semicolon is the separator.
                line['Authors'] = re.sub(r'\.,', ';', line['Authors'])
                authors = Author.parse_sn_first_list(line['Authors'],
                                                     separator=u';')
            pub = Publication(line['Title'], authors, to_num(line['Year']))
            source_title = empty_to_none(line['Source title'])
            if source_title:
                source_title, replacements = re.subn(
                    r' \(including subseries [^)]+\)', '', source_title)
                source_title = source_title.strip()
                # A stripped "(including subseries ...)" suffix marks a series.
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title
            pub.volume = empty_to_none(line['Volume'])
            pub.issue = empty_to_none(line['Issue'])
            pub.pages = make_page_range(empty_to_none(line['Page start']),
                                        empty_to_none(line['Page end']))

            # (mrshu): for reasons unknown SCOPUS now returns a combined
            # 'DOILink' column holding both fields. No cleaner way was found;
            # this hack at least splits them apart: DOI before the first
            # double quote, link after it.
            splits = line['DOILink'].split('"')
            if len(splits) > 1:
                link, doi = splits[1], splits[0]
            else:
                link, doi = splits[0], None

            pub.times_cited = empty_to_none(line['Cited by'])
            pub.article_no = empty_to_none(line['Art. No.'])
            pub.publisher = empty_to_none(line['Publisher'])
            url = empty_to_none(link)

            if url:
                pub.source_urls.append(
                    URL(url, type='SCOPUS', description='SCOPUS'))
                url_query = parse_qs(urlparse(url).query)
                # The original condition tested the builtin `len` (always
                # true); check that an eid value is actually present.
                eids = url_query.get('eid')
                if eids:
                    pub.identifiers.append(
                        Identifier(eids[0], type='SCOPUS'))

            for issn in list_remove_empty(line['ISSN'].split(u';')):
                pub.identifiers.append(Identifier(issn, type='ISSN'))

            for isbn in list_remove_empty(line['ISBN'].split(u';')):
                pub.identifiers.append(Identifier(isbn, type='ISBN'))

            doi = empty_to_none(doi)
            if doi:
                pub.identifiers.append(Identifier(doi, type='DOI'))

            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

            yield pub
Exemple #20
0
 def test_add_file_correct_input_one_file(self):
     """add_file() opens the file exactly once, by the name it was given."""
     with mock.patch('model.index.Config', autospec=True, spec_set=True):
         with mock.patch('builtins.open', mock.mock_open()) as fake_open:
             idx = Index()
             idx.add_file('document')
         fake_open.assert_called_once_with('document')
Exemple #21
0
 def test_index_raises_value_error_when_wrong_config_file(self, mock_config):
     """Index() must raise ValueError when the injected config mock raises it.

     NOTE(review): mock_config is presumably the patched Config class
     supplied by a decorator outside this view -- confirm.
     """
     mock_config.side_effect = ValueError
     self.assertRaises(ValueError, Index)
Exemple #22
0
class Application(object):
    """Ties together the index, the SQLite datastore, the task queue and search.

    Settings are read from the class attribute ``config``.
    """
    config = Config

    def __init__(self):
        """Create the index, the database session factory, the queue and search."""
        self.index = Index()
        self.__init_db()
        workers = self.config.WORKER_THREADS
        self.queue = TaskQueue(application=self, num_workers=workers)
        self.search_module = Search(application=self)

    def __init_db(self):
        """Open the SQLite database under ./datastore and create missing tables."""
        here = os.path.dirname(os.path.realpath(__file__))
        print(here)
        self.db_path = "sqlite:///%s/datastore/%s" % (here, self.config.DB)
        self.engine = create_engine(self.db_path)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def init(self):
        """Queue pending pages, wait for them to be processed, then build the search index.

        With an empty Page table one page per ``config.ROOT_NODES`` entry is
        created and queued; otherwise every page whose state is not
        PROCESSED is queued again.
        """
        queued = 0
        with session_scope(self.Session) as session:
            if session.query(Page).count() != 0:
                # Resume: re-queue whatever has not been processed yet.
                pending = session.query(Page).filter(Page.state != Page.State.PROCESSED).all()
                for page in pending:
                    self.queue.add_page(page.page_id)
                    queued += 1
            else:
                for root_url in self.config.ROOT_NODES:
                    root = Page(url=root_url)
                    session.add(root)
                    # Commit per page, before root.page_id is read for queueing.
                    session.commit()
                    self.queue.add_page(root.page_id)
                    queued += 1
            session.expunge_all()

        if queued:
            print('No movie data in our system. We need to scrape IMDB for data...')
            print('Started pipeline! Added %s root pages to processing queue' % queued)
            self.queue.join()
            print("Finished processing!")
        self.search_module.build_index()

    def search(self, search_str):
        """Delegate the query to the search module and return its result."""
        return self.search_module.search(search_str)

    def rebuild(self):
        """Wipe all stored data and run the initialisation again."""
        self.clear()
        self.init()

    def clear(self):
        """Reset the index, empty every table and delete the object store contents."""
        self.index.reset()
        metadata = Base.metadata
        with session_scope(self.Session) as session:
            # Reverse dependency order, as given by reversed(sorted_tables).
            for table in reversed(metadata.sorted_tables):
                session.execute(table.delete())
            session.commit()
        Base.metadata.create_all(self.engine)

        here = os.path.dirname(os.path.realpath(__file__))
        store_dir = "%s/datastore/object_store" % here
        for entry in os.listdir(store_dir):
            target = os.path.join(store_dir, entry)
            try:
                if os.path.isfile(target):
                    os.unlink(target)
                elif os.path.isdir(target):
                    shutil.rmtree(target)
            except Exception as err:
                # Best effort: report the failure and go on with the next entry.
                print(err)
Exemple #23
0
 def __init__(self):
     """Create the index, initialise the database, then build the queue and search module."""
     self.index = Index()
     self.__init_db()
     workers = self.config.WORKER_THREADS
     self.queue = TaskQueue(application=self, num_workers=workers)
     self.search_module = Search(application=self)
Exemple #24
0
    def entries_to_publications(self, entries):
        """Convert entries from the SCOPUS JSON representation into Publications.

        Args:
            entries: iterable of dicts; each must provide 'author-count',
                'prism:coverDate', 'dc:title', 'citedby-count' and 'link'
                ('author' too when the author count is non-zero). The
                'prism:*' and 'eid' keys are optional.

        Yields:
            One Publication per entry, with source and cited-by URLs,
            DOI/ISBN/ISSN/SCOPUS identifiers and a 'SCOPUS' index attached.
        """

        def empty_to_none(s):
            """Return ``s`` stripped, or None when it is None or blank."""
            if s is None:
                return None
            s = s.strip()
            return s or None

        def exists_to_none(d, key):
            """Return the cleaned value for ``key``, or None when the key is absent.

            A list value is mapped to the cleaned '$' field of each item.
            """
            if key not in d:
                return None
            value = d[key]
            if isinstance(value, list):
                return [empty_to_none(x['$']) for x in value]
            return empty_to_none(value)

        def append_identifier(d, key, obj, id_type):
            """Append an Identifier of ``id_type`` for every non-empty value under ``key``."""
            values = exists_to_none(d, key)
            if not values:
                return
            if not isinstance(values, list):
                values = [values]
            for value in values:
                # List items can be None after stripping; skip them, just as
                # an empty scalar value is skipped.
                if value:
                    obj.identifiers.append(Identifier(value, type=id_type))

        for entry in entries:
            author_count = int(entry['author-count']['$'])
            if author_count == 0:
                authors = []
            else:
                authors = self.authors_from_json(entry['author'])

            # The year is the part of the cover date before the first '-'.
            year = empty_to_none(entry['prism:coverDate'])
            if year:
                year = int(year.split('-')[0])
            pub = Publication(empty_to_none(entry['dc:title']), authors, year)
            pub.times_cited = empty_to_none(entry['citedby-count'])

            source_title = exists_to_none(entry, 'prism:publicationName')
            if source_title:
                source_title, replacements = re.subn(INCLUDING_RE,
                                                     '',
                                                     source_title)
                source_title = source_title.strip()
                # A removed INCLUDING_RE match marks the title as a series.
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title

            url = self.find_next_url(entry['link'], ref='scopus')
            pub.source_urls.append(URL(url,
                                       type='SCOPUS',
                                       description='SCOPUS'))

            citedby_url = self.find_next_url(entry['link'],
                                             ref='scopus-citedby')
            if citedby_url is not None:
                pub.cite_urls.append(URL(citedby_url,
                                         type='SCOPUS',
                                         description='SCOPUS'))

            # pub.pages is set once here; the duplicate assignment was dropped.
            pub.pages = exists_to_none(entry, 'prism:pageRange')
            pub.volume = exists_to_none(entry, 'prism:volume')
            pub.issue = exists_to_none(entry, 'prism:issueIdentifier')

            append_identifier(entry, 'prism:doi', pub, 'DOI')
            append_identifier(entry, 'prism:isbn', pub, 'ISBN')
            append_identifier(entry, 'prism:issn', pub, 'ISSN')
            append_identifier(entry, 'eid', pub, 'SCOPUS')

            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

            yield pub