Example #1
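    # Manual test: run a single registered parser against a hard-coded arXiv URL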
    ctx = JobContext("Test Filename")

    parser = register_parser.parser_dict['arxiv.org']
    sr = SearchResult(None, "http://arxiv.org/abs/1312.6680")
    #sr = SearchResult(None, "  http://arxiv.org/abs/1404.3610")

    #parser = register_parser.parser_dict['dl.acm.org']
    #url = "http://dl.acm.org/citation.cfm?id=1859761"  # twitter
    #url = "http://dl.acm.org/citation.cfm?id=996342"    # SIFT # Large Number of cited
    #url = "http://dl.acm.org/citation.cfm?id=2366157"  # big
    #url = "http://dl.acm.org/citation.cfm?id=1656278"  # Weka
    #sr = SearchResult(None, url)

    #parser = register_parser.parser_dict['ieeexplore.ieee.org']
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=00726791")
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4244529")


    #parser = register_parser.parser_dict['sciencedirect.com']
    #url = "http://www.sciencedirect.com/science/article/pii/S1570870513000073"
    #sr = SearchResult(None, url)


    #params = parser.fetch_info(ctx, sr)
    #print params
    data = parser.download(sr)

    print ctx.title
    if ukconfig.USE_DB and ctx.success:
        pid = new_paper(ctx)
Example #2
def handle_title_query(query):
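    """Resolve a paper-title query: try the local db first (prefix and
    similar-title search), then fan out to all registered web searchers in
    parallel, fetch metadata from the matching parsers, save a new paper
    record, and start downloading in a background thread."""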
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # prefix (startswith) search in the db
    res = search_startswith(query)  # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res
    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
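    # run every registered searcher on the query in parallel via a worker pool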
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Collect the results from every searcher
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # retry the database search with the title updated by the searcher
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
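    # all searcher results have been collected; shut down the pool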
    pool.close()
    pool.terminate()

    # Analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # skip a non-repeatable parser that has already been tried
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(
                            len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None
    # Save data, return data and start downloading
    try:
        pid = new_paper(ctx)
        ret = [{
            '_id': pid,
            'title': ctx.title,
            'view_cnt': 1,
            'download_cnt': 0
        }]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
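            # hand the actual download off to a background thread so the query returns immediately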
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
Example #3
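    # Same manual test; passing one command-line argument enables saving to the db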
    if len(sys.argv) == 2:
        ukconfig.USE_DB = True
    ctx = JobContext("Test Filename")

    parser = register_parser.parser_dict['arxiv.org']
    sr = SearchResult(None, "http://arxiv.org/abs/1312.6680")
    #sr = SearchResult(None, "  http://arxiv.org/abs/1404.3610")

    #parser = register_parser.parser_dict['dl.acm.org']
    #url = "http://dl.acm.org/citation.cfm?id=1859761"  # twitter
    #url = "http://dl.acm.org/citation.cfm?id=996342"    # SIFT # Large Number of cited
    #url = "http://dl.acm.org/citation.cfm?id=2366157"  # big
    #url = "http://dl.acm.org/citation.cfm?id=1656278"  # Weka
    #sr = SearchResult(None, url)

    #parser = register_parser.parser_dict['ieeexplore.ieee.org']
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=00726791")
    #sr = SearchResult(None, "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4244529")

    #parser = register_parser.parser_dict['sciencedirect.com']
    #url = "http://www.sciencedirect.com/science/article/pii/S1570870513000073"
    #sr = SearchResult(None, url)

    #params = parser.fetch_info(ctx, sr)
    #print params
    data = parser.download(sr)

    print ctx.title
    if ukconfig.USE_DB and ctx.success:
        pid = new_paper(ctx)
Example #4
def handle_title_query(query):
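    """Resolve a paper-title query: try the local db first (prefix and
    similar-title search), then fan out to all registered web searchers in
    parallel, fetch metadata from the matching parsers, save a new paper
    record, and start downloading in a background thread."""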
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # prefix (startswith) search in the db
    res = search_startswith(query)  # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res
    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # Collect the results from every searcher
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # retry the database search with the title updated by the searcher
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # Analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # skip a non-repeatable parser that has already been tried
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None
    # Save data, return data and start downloading
    try:
        pid = new_paper(ctx)
        ret = [{'_id': pid,
                'title': ctx.title,
                'view_cnt': 1,
                'download_cnt': 0
               }]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download, args=(download_candidates,
                                                         ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")