Exemple #1
0
 def test_sleep_multiple_requests(self, patched_time_sleep):
     """Check that the request delay is enforced across two different URLs."""
     client = arxiv.Client(page_size=1)
     quantum_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)
     testing_url = client._format_url(arxiv.Search(query="testing"), 0, 1)

     # First fetch on a fresh client: no sleep is expected.
     client._parse_feed(quantum_url)
     patched_time_sleep.assert_not_called()

     # Pretend the previous request happened just now, then fetch a
     # *different* URL. Rate limiting is URL-independent, so the expected
     # behavior is the same as in `test_sleep_standard`.
     client._last_request_dt = datetime.now()
     client._parse_feed(testing_url)
     expected_delay = approx(client.delay_seconds, rel=1e-3)
     patched_time_sleep.assert_called_once_with(expected_delay)
Exemple #2
0
def dbsearch_arxiv(
    data: list[dict],
    silent: bool = False,
) -> dict:
    """
    Check online databases (can be slow!).

    Two passes are made over ``data``:

    1.  Entries that have a doi but no arxivid: arXiv is searched for the quoted doi and
        the identifier of every hit is stored under ``"arxivid"``.
    2.  Entries that have an arxivid: the arXiv record is fetched and every doi it reports
        (other than the doi the entry already has) is stored under ``"doi"``.

    :param data: Entries to check. Each entry is passed to ``get_identifiers``; its ``"ID"``
        is used as key in the output when something is discovered.
    :param silent: Hide status bar.
    :return: Dictionary with discovered items, ``{entry["ID"]: {"arxivid": [...], "doi": [...]}}``.
        Only entries (and fields) for which something was found are present.
    """

    output = defaultdict(lambda: defaultdict(list))

    # find arxivid based on journal doi

    for entry in tqdm.tqdm(data, disable=silent):
        iden = get_identifiers(entry)
        if "arxivid" in iden or "doi" not in iden:
            continue
        doi = iden["doi"]
        found = [
            # keep only what follows ".../abs/" in the entry URL
            re.sub(r"(http)(s?)(://arxiv.org/abs/)(.*)", r"\4", result.entry_id)
            for result in arxiv.Search(query=f'"{doi}"').results()
        ]
        # guard: do not let the defaultdict create empty entries
        if found:
            output[entry["ID"]]["arxivid"].extend(found)

    # arXiv preprint: check if journal id is present
    # possible optimisation: can be made faster to bluntly skip all entries that have a doi

    for entry in tqdm.tqdm(data, disable=silent):
        iden = get_identifiers(entry)
        if "arxivid" not in iden:
            continue
        known_doi = iden["doi"] if "doi" in iden else None
        found = [
            result.doi
            for result in arxiv.Search(id_list=[iden["arxivid"]]).results()
            # a preprint without a registered doi reports no doi: that is not a discovery
            if result.doi and result.doi != known_doi
        ]
        if found:
            output[entry["ID"]]["doi"].extend(found)

    # convert the nested defaultdicts to plain dictionaries
    return {key: dict(value) for key, value in output.items()}
Exemple #3
0
    def _get_queries_from_last_day(self, max_results=100):
        """Collect recent, keyword-matching arXiv entries that were not sent before.

        For every category in ``self._categories`` the ``max_results`` most
        recently submitted entries are fetched and wrapped in ``Query``; only
        those flagged ``is_recent`` are kept. The combined list is then
        de-duplicated by ``id``, restricted to entries whose string form
        contains at least one of ``self._keywords`` (case-insensitive), sorted
        from most to least recent, and stripped of entries that were already
        sent. The remaining entries are recorded as sent before returning.

        Args:
            max_results: Maximum number of entries requested per category.

        Returns:
            List of ``Query`` objects, most recent first. Empty when no
            keywords are configured (nothing can match).
        """
        queries = []

        # get all queries in the categories in the last day
        for category in self._categories:
            search = arxiv.Search(
                query=category,
                sort_by=arxiv.SortCriterion.SubmittedDate,
                max_results=max_results,
            )
            new_queries = [Query(q) for q in search.get()]
            queries += [q for q in new_queries if q.is_recent]

        # get rid of duplicates (the last occurrence of an id wins)
        queries = list({q.id: q for q in queries}.values())

        # only keep queries that contain keywords; `any` (unlike `max`) is
        # well defined for an empty keyword list
        keywords = [k.lower() for k in self._keywords]
        matching = []
        for q in queries:
            text = str(q).lower()
            if any(k in text for k in keywords):
                matching.append(q)
        queries = matching

        # sort from most recent to least
        queries = sorted(queries, key=lambda q: q.date, reverse=True)

        # filter if previously sent
        prev_arxivs = self._get_previously_sent_arxivs()
        queries = [q for q in queries if q.id not in prev_arxivs]
        self._save_previously_sent_arxivs(queries)

        return queries
 def test_get_short_id(self):
     """The short ID is the queried ID followed by a version suffix."""
     paper_id = "1707.08567"
     search = arxiv.Search(id_list=[paper_id])
     short_id = next(search.get()).get_short_id()
     self.assertTrue(short_id.startswith(paper_id))
     # Should be of form `1707.08567v1`.
     versioned_pattern = r'^{}v\d+$'.format(paper_id)
     self.assertTrue(re.match(versioned_pattern, short_id))
Exemple #5
0
 def test_query_page_count(self):
     """Fetching 55 results with a page size of 10 takes six feed requests."""
     client = arxiv.Client(page_size=10, delay_seconds=0)
     # Wrap (rather than replace) the feed parser so that calls are counted
     # while the real parser still runs.
     client._parse_feed = MagicMock(wraps=client._parse_feed)
     search = arxiv.Search(query="testing", max_results=55)
     results = list(client.get(search))
     self.assertEqual(len(results), 55)
     self.assertEqual(client._parse_feed.call_count, 6)
Exemple #6
0
def is_valid_arxiv_id(arxiv_id: str) -> bool:
    """Return whether an arXiv search for ``arxiv_id`` yields at least one result.

    Any ordinary failure while fetching the first result (no result, request
    error, malformed id, ...) is reported as ``False``. Interpreter-level
    signals such as ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
    not swallowed.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    try:
        next(search.results())
    except Exception:
        return False
    return True
 def test_result_shape(self):
     """Every one of the requested results passes the shape validation."""
     expected_count = 100
     search = arxiv.Search("testing", max_results=expected_count)
     fetched = list(search.get())
     self.assertEqual(len(fetched), expected_count)
     for item in fetched:
         self.assert_valid_result(item)
Exemple #8
0
    def run(self):
        """Build a paragraph node that cites the arXiv paper given as argument.

        ``self.arguments[0]`` is used as the citation label and
        ``self.arguments[1]`` as the arXiv id to look up. The paragraph reads
        ``[label] authors, *title*[, journal ref][, doi: ...] (open)``, where
        ``(open)`` links to the paper's PDF.

        Returns:
            A one-element list with the paragraph node, or an empty list when
            the paper cannot be retrieved (the lookup is best-effort).
        """

        # search arXiv database
        try:
            search = arxiv.Search(id_list=[self.arguments[1]])
            paper = next(search.results())
        except Exception:
            return []

        # generate journal link nodes
        ret_node = nodes.paragraph()

        # Each optional detail carries its own leading separator, so the
        # text is well formed whether one, both, or none are present.
        details = []
        if paper.journal_ref:
            details.append(f"{paper.journal_ref}")
        if paper.doi:
            details.append(f"doi: {paper.doi}")
        journal = "".join(f", {detail}" for detail in details)

        ret_node += nodes.Text(f"[{self.arguments[0]}] ")
        ret_node += nodes.Text(", ".join([author.name for author in paper.authors]) + ", ")
        ret_node += nodes.emphasis(text=f"{paper.title}")
        if journal:
            ret_node += nodes.Text(journal)
        ret_node += nodes.Text(" ")
        ret_node += nodes.reference(text="(open)", refuri=paper.pdf_url)

        return [ret_node]
Exemple #9
0
def get_accurate_name_from_arxiv(paper_title: str):
    """Return the title arXiv has on record for the paper named ``paper_title``.

    The paper is located through a web search restricted to arxiv.org
    (``search`` is presumably a Google-search helper -- confirm against the
    file's imports); the arXiv id is taken from the first hit's URL and the
    record is then fetched from arXiv.

    Raises:
        IndexError: If the web search has no hit or the URL has no id.
        StopIteration: If arXiv returns no record for the id.
    """
    # Querying arXiv by title is unreliable, so a web search is used to find
    # the arXiv id instead.
    arxiv_url = list(search(f'{paper_title} site:arxiv.org', stop=1))[0]
    arxiv_id = re.findall(r'\d+\.\d+', arxiv_url)[0]

    # `arxiv.Search` is a lazy query description, not a list: take the first
    # result from its iterator, and read fields as attributes.
    paper = next(arxiv.Search(id_list=[arxiv_id]).results())
    return paper.title
Exemple #10
0
 def gen_record(document_id, primary_doc, gen_links):
     """Generate a record from an arXiv URL.

     Example ``document_id``: https://arxiv.org/abs/1810.04805

     arXiv API reference:
     https://arxiv.org/help/api/user-manual#_calling_the_api
     (example API URL: http://export.arxiv.org/api/query?id_list=1311.5600)

     ``gen_links`` is accepted but not used in this function.
     """
     # Everything after the last "abs/" is the paper id; a string without
     # "abs/" is used unchanged.
     _, _, paper_id = document_id.rpartition("abs/")
     first_result = next(arxiv.Search(id_list=[paper_id]).get())
     return gen_arxiv_record_from_result(first_result, primary_doc=primary_doc)
def find_in_arxiv(paper_url):
    """Look up a paper on arXiv from its URL.

    The id is the last ``/``-separated segment of ``paper_url``; when that
    segment contains ``pdf`` its last four characters are dropped.

    Returns:
        Tuple ``(year, authors, title, paper_id)`` where ``year`` is the
        publication year and ``authors`` is a list of ``{"name": ...}``
        dictionaries.
    """
    last_segment = paper_url.rsplit('/', 1)[-1]
    paper_id = last_segment[:-4] if 'pdf' in last_segment else last_segment

    search = arxiv.Search(id_list=[paper_id])
    paper = next(search.get())

    return (
        paper.published.year,
        [{"name": author.name} for author in paper.authors],
        paper.title,
        paper_id,
    )
Exemple #12
0
def search_arxiv(queries, max_results=100):
    '''
    Search arXiv for each query and collect the most recently submitted
    results (at most `max_results` per query) into one DataFrame.

    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results to fetch per query.
                            Default value is 100.

    returns:
        A DataFrame with one row per fetched result and the columns
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`,
            `authors`, `year`
        `article_id` is an integer assigned per distinct arXiv entry in order
        of first appearance, so the same paper found by two queries shares
        one id. With no queries or no results the DataFrame is empty but
        still has all columns.

    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    columns = [
        'title', 'date', 'article_id', 'url', 'main_topic', 'all_topics',
        'authors',
    ]
    rows = []

    # hitting the API (results are fetched lazily while iterating)
    for query in queries:
        search = arxiv.Search(query=query,
                              max_results=max_results,
                              sort_by=arxiv.SortCriterion.SubmittedDate,
                              sort_order=arxiv.SortOrder.Descending)
        for res in search.results():
            rows.append({
                'title': res.title,
                'date': res.published,
                'article_id': res.entry_id,
                'url': res.pdf_url,
                'main_topic': res.primary_category,
                'all_topics': res.categories,
                'authors': res.authors,
            })

    # Naming the columns explicitly keeps them present when `rows` is empty,
    # so the lookups below do not fail.
    d = pd.DataFrame(rows, columns=columns)
    d['year'] = pd.DatetimeIndex(d['date']).year

    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art: idx for idx, art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d
Exemple #13
0
 def test_sleep_standard(self, patched_time_sleep):
     """A client sleeps until `delay_seconds` have passed between requests."""
     client = arxiv.Client(page_size=1)
     search = arxiv.Search(query="quantum")
     feed_url = client._format_url(search, 0, 1)

     # First request on a fresh client: no sleep is expected.
     client._parse_feed(feed_url)
     patched_time_sleep.assert_not_called()

     # Overwrite _last_request_dt to minimize flakiness: different
     # environments will have different page fetch times.
     client._last_request_dt = datetime.now()
     client._parse_feed(feed_url)
     expected_delay = approx(client.delay_seconds, rel=1e-3)
     patched_time_sleep.assert_called_once_with(expected_delay)
 def dl_paper(self):
     """Download pdf paper with 'paper_id' in working directory.

     Also stores the paper's title in `self.paper_title` and the text
     extracted from the downloaded file in `self.paper_content`.
     """
     # first search result for the paper id
     paper = next(arxiv.Search(id_list=[self.paper_id]).get())
     self.paper_title = paper.title
     # the pdf is saved under `self.paper_name`, which is then read back
     # for text extraction
     paper.download_pdf(filename=self.paper_name)
     self.paper_content = extract_text(self.paper_name)
Exemple #15
0
 def test_sleep_elapsed(self, patched_time_sleep):
     """Sleeping depends on how long ago the previous request was made."""
     client = arxiv.Client(page_size=1)
     search = arxiv.Search(query="quantum")
     feed_url = client._format_url(search, 0, 1)

     # If _last_request_dt is less than delay_seconds ago, sleep.
     almost_elapsed = timedelta(seconds=client.delay_seconds - 1)
     client._last_request_dt = datetime.now() - almost_elapsed
     client._parse_feed(feed_url)
     patched_time_sleep.assert_called_once()
     patched_time_sleep.reset_mock()

     # If _last_request_dt is at least delay_seconds ago, don't sleep.
     fully_elapsed = timedelta(seconds=client.delay_seconds)
     client._last_request_dt = datetime.now() - fully_elapsed
     client._parse_feed(feed_url)
     patched_time_sleep.assert_not_called()
Exemple #16
0
def query_recent(category):
    """
    Query the arxiv for the updates of the last day for a given category

    At most 75 results are requested, sorted by last-updated date. Results
    are collected in that order until the first one whose update time is not
    today (according to `is_today`); everything after it is ignored.
    """
    search = arxiv.Search(
        query=category, max_results=75, sort_by=arxiv.SortCriterion.LastUpdatedDate
    )
    updated_today = []
    for paper in search.results():
        if not is_today(paper.updated):
            break
        updated_today.append(paper)
    return updated_today
Exemple #17
0
def main():
    """Send a paper to the reMarkable via `rmapi put`.

    The single command-line argument is either a path to a file or, when
    `is_arxiv_code` accepts it, an arXiv id. In the latter case the paper is
    first downloaded to `/tmp/` under a name derived from `to_slug`.
    """
    parser = argparse.ArgumentParser(description='Paper to transfer to Remarkable')
    parser.add_argument('paper', type=str, help='ArVix code or path to file')
    args = parser.parse_args()

    target = args.paper
    if is_arxiv_code(target):
        results = arxiv.Search(id_list=[target]).results()
        paper = list(results)[0]
        slug = to_slug(paper)
        paper.download_pdf(dirpath="/tmp/", filename=slug + '.pdf')
        print("Adding " + paper.title + " to Remarkable")
        # upload the downloaded file instead of the raw argument
        target = "/tmp/" + to_slug(paper) + ".pdf"

    call(["rmapi", "put", target])
Exemple #18
0
def arxiv_query_info(arxiv_id_raw):
    """
    Build a text message with the id, title, authors and abstract of the
    queried paper. Newlines inside the abstract are replaced by spaces.
    """
    arxiv_id = url_to_id(arxiv_id_raw)
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(search.results())
    authors = list(map(str, paper.authors))
    abstract = " ".join(paper.summary.split("\n"))
    return f""" > {arxiv_id}
Title: {paper.title}

Authors: {authors}

Abstract: {abstract}
    """
Exemple #19
0
def parse_arxiv(command):
    """
    Hacky way to parse arXiv IDs out of a sentence and fetch their articles.

    Every match of ARXIV_REGEX is printed; matches containing 'arxiv'
    contribute the last path segment (with anything from '.pdf' on removed)
    as an ID. Returns the list of search results for those IDs, or an empty
    list when no ID was found.
    """
    arxiv_ids = []
    for link in re.findall(ARXIV_REGEX, command):
        print(link)
        if 'arxiv' in link:
            last_segment = link.split('/')[-1]
            arxiv_ids.append(last_segment.split('.pdf')[0])
    if not arxiv_ids:
        return []
    return list(arxiv.Search(id_list=arxiv_ids).results())
Exemple #20
0
def get_arxiv_id(paper_title: str, feel_lucky: bool = True):
    """Return the versioned arXiv id (e.g. ``1911.05722v3``) for a paper title.

    The paper is located through a web search restricted to arxiv.org
    (``search`` is presumably a Google-search helper -- confirm against the
    file's imports); the id found in the first hit's URL is then looked up on
    arXiv and the last segment of the record's PDF URL is returned.

    Args:
        paper_title: Title to search for.
        feel_lucky: When ``False``, both titles are printed and the user is
            asked to confirm; answering ``n`` aborts.

    Returns:
        The last path segment of the paper's PDF URL, or ``None`` when the
        user declines.

    Raises:
        IndexError: If the web search has no hit or the URL has no id.
        StopIteration: If arXiv returns no record for the id.
    """
    # Querying arXiv by title is unreliable, so a web search is used to find
    # the arXiv id instead.
    arxiv_url = list(search(f'{paper_title} site:arxiv.org', stop=1))[0]
    arxiv_id = re.findall(r'\d+\.\d+', arxiv_url)[0]

    # `arxiv.Search` is a lazy query description, not a list: take the first
    # result from its iterator, and read fields as attributes.
    paper = next(arxiv.Search(id_list=[arxiv_id]).results())
    if not feel_lucky:
        print(paper_title)
        print(paper.title)
        if input('Should we continue') == 'n':
            return None

    # TODO: assert that paper_title matches paper.title.

    # for example http://arxiv.org/pdf/1911.05722v3 -> 1911.05722v3
    return paper.pdf_url.split('/')[-1]
Exemple #21
0
def build_content(query, query_config):
    """Build one newsletter message per keyword from arXiv search results.

    Args:
        query: Mapping with the keys ``'domains'`` and ``'keywords'``; each
            keyword is combined with the domains by ``build_query``.
        query_config: Extra keyword arguments forwarded to ``arxiv.Search``.

    Returns:
        List of ``(subject, content)`` tuples, one per keyword and in keyword
        order. The subject carries the module-level ``today`` and an
        ``index/total`` counter; the content is a banner, the keyword, and
        one formatted entry per search result.
    """
    domains = query['domains']
    keywords = query['keywords']
    total_mail = len(keywords)

    subject_placeholder = 'arXiv newsletter ' + str(today) + ' {index}/' + str(
        total_mail)
    content_placeholder = '\n' + '*' * 35 + '\n ' + subject_placeholder + ' \n' + '*' * 35 + '\n'
    entry_placeholder = '{index}. {title}\n{authors}\nPublished at: {publish}\nUpdated at: {update}\nPrimary Category: {primary_category}\nCategories: {categories}\n{notes}\n{link}\n\nAbstract:\n{abstract}\n'

    messages = []

    for i, keyword in enumerate(keywords):
        search_query = build_query(domains, keyword)
        print(search_query)
        # Constructed directly: retrying this in an unbounded loop with a
        # bare `except` could only spin forever on a persistent error (for
        # instance an invalid `query_config` key) and made the program
        # impossible to interrupt.
        results = arxiv.Search(query=search_query, **query_config)
        entries = ''.join(
            entry_placeholder.format(
                index=j + 1,
                title=result.title,
                authors=', '.join([author.name for author in result.authors]),
                publish=result.published,
                update=result.updated,
                primary_category=result.primary_category,
                categories=', '.join(result.categories),
                link='\n'.join([link.href for link in result.links]),
                abstract=result.summary,
                notes=f'Comments: {result.comment}\n'
                if result.comment is not None else '') + '\n'
            for j, result in enumerate(results.results()))
        subject = subject_placeholder.format(index=i + 1)
        content = content_placeholder.format(index=i + 1)
        content += '\nQuery: ' + keyword + '\n\n' + entries
        messages.append((subject, content))

    return messages
def get_arxiv_papers(
        query: str,
        fields: List = [
            "title", "authors", "date", "abstract", "journal", "doi"
        ],
        max_results: int = 99999,
        client_options: Dict = {"num_retries": 10},
        search_options: Dict = dict(),
):
    """
    Run a query against the arxiv API and return the matching papers, each
    reduced to the requested fields.

    Args:
        query (str): Query to arxiv API. Needs to match the arxiv API notation.
        fields (List[str]): Names of the fields to keep in the output. Names
            are compared after translation through `arxiv_field_mapper`.
        max_results (int): Maximal number of results, defaults to 99999.
        client_options (Dict): Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options (Dict): Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.

    Returns:
        list of dicts. One dict per paper.

    """

    def keep_as_is(value):
        return value

    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query,
                          max_results=max_results,
                          **search_options)

    papers = []
    for paper in client.results(search):
        record = {}
        for raw_name, raw_value in vars(paper).items():
            # Translate the attribute name to the output field name; names
            # without a mapping are used unchanged.
            name = arxiv_field_mapper.get(raw_name, raw_name)
            if name not in fields:
                continue
            # Fields without a dedicated processor are copied as they are.
            convert = process_fields.get(name, keep_as_is)
            record[name] = convert(raw_value)
        papers.append(record)
    return papers
Exemple #23
0
def find_download_pdf(pdf_name, data):
    """Given a name of a pdf, downloads the pdf and returns its text

    The first row of ``data`` whose ``title`` contains ``pdf_name`` (as a
    literal substring) is selected; the value in that row's second column is
    used as the arXiv id to download.

    :param pdf_name: name of the pdf to download which contains to claim
    :type pdf_name:  str
    :param data: arxiv dataset which contains the details of all pdfs and their authors, links etc
    :type data:  pandas dataframe

    :return: all the content/text found in the pdf, with form feeds removed
    :rtype: str

    :raises IndexError: if no title in ``data`` contains ``pdf_name``
    """

    # SECURITY NOTE(review): this switches off HTTPS certificate verification
    # for the whole process, not only for this download. It is kept because
    # callers may depend on it, but it should be replaced by a proper
    # certificate setup.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # Legacy Python that doesn't verify HTTPS certificates by default
        pass
    else:
        # Handle target environment that doesn't support HTTPS verification
        ssl._create_default_https_context = _create_unverified_https_context

    # regex=False: titles routinely contain characters such as "(", "+" or
    # "?" that must be matched literally, not interpreted as a pattern.
    matches = data[data['title'].str.contains(pdf_name, regex=False)]

    # presumably the second column holds the arXiv id -- verify against the dataset
    arxiv_id = matches.iloc[0, 1]

    paper = next(arxiv.Search(id_list=[arxiv_id]).get())
    pdf_path = paper.download_pdf()

    ref_text = textract.process(pdf_path, method='pdfminer')

    # Convert text type from bytes to string
    ref_text = ref_text.decode("utf-8")

    # drop the form feed characters
    return ref_text.replace('\x0c', '')
Exemple #24
0
 def test_max_results(self):
     """`max_results` caps the result count below the page size."""
     client = arxiv.Client(page_size=10, delay_seconds=0)
     capped_search = arxiv.Search(query="testing", max_results=2)
     fetched = list(client.get(capped_search))
     self.assertEqual(len(fetched), 2)
Exemple #25
0
def crawler(query,
            sort_by,
            sort_order,
            page_size,
            subjectcategory,
            max_results=float('inf')):
    """Search arXiv per subject/keyword and store new papers as per-year markdown.

    For every subject, one search is run per keyword. Results that belong to
    none of the requested categories, or whose short id is already in the
    subject's id set (from ``load_set``), are skipped. The remaining results
    are grouped by the year of their ``updated`` time, merged with the
    entries already present in ``<arxiv_db_path>/<year>.md`` (if that file
    exists) and written back, sorted by update time. If anything new was
    found for the subject, the id set is written to ``db.txt`` as JSON.

    Errors raised while fetching the results of a keyword are printed and
    the crawl continues with the next keyword.

    Args:
        query: JSON string; decodes to a mapping of subject to a list of
            keywords.
        sort_by: Key into ``sort_by_dict``.
        sort_order: Key into ``sort_order_dict``.
        page_size: Page size of the client; converted with ``int``.
        subjectcategory: JSON string; the decoded value is what result
            categories are tested against.
        max_results: Maximum number of results per search; converted with
            ``int`` when given as a string.
    """
    # Parameter handling
    query = json.loads(query)
    subjectcategory = json.loads(subjectcategory)
    max_results = int(max_results) if isinstance(max_results, str) else max_results

    # Client configuration: one API request every 5 seconds, retry 5 times on error
    client = arxiv.Client(
        page_size=int(page_size),
        delay_seconds=5,
        num_retries=5
    )

    for subject, key_words in query.items():
        query_results = defaultdict(list)
        db_set, arxiv_db_path = load_set(subject)

        # One search request per keyword
        for key_word in key_words:
            search = arxiv.Search(
                query=key_word,
                max_results=max_results,
                sort_by=sort_by_dict[sort_by],
                sort_order=sort_order_dict[sort_order]
            )

            try:
                for result in client.get(search):
                    # Is the result in one of the requested categories?
                    # (for/else: the `else` runs only when no category matched)
                    for cate in result.categories:
                        if cate in subjectcategory:
                            break
                    else:
                        continue

                    # Is it already present in the database?
                    short_id = result.get_short_id()
                    if short_id in db_set:
                        continue
                    db_set.add(short_id)

                    year = result.updated.tm_year
                    ori = dict()
                    ori['title'] = result.title
                    ori['authors'] = [author.name for author in result.authors]
                    # raw update time, kept only to sort the entries below
                    ori['updated_sorted'] = result.updated
                    # ori['published'] = time.strftime('%Y-%m-%d %H:%M:%S', result.published)
                    ori['updated'] = time.strftime('%Y-%m-%d %H:%M:%S', result.updated)
                    ori['summary'] = result.summary.replace('\n', ' ')
                    # ori['comment'] = result.comment
                    # ori['primary_category'] = result.primary_category
                    # ori['categories'] = result.categories
                    ori['pdf_url'] = result.get_pdf_url()
                    ori['short_id'] = result.get_short_id()
                    query_results[year].append(ori)
            except arxiv.UnexpectedEmptyPageError:
                print(f"{subject}--{key_word}: arxiv.UnexpectedEmptyPageError")
            except arxiv.HTTPError:
                print(f"{subject}--{key_word}: arxiv.HTTPError")
            except Exception as error:
                print(f"{subject}--{key_word}: {error}")

        # Parse and store the results
        for year, results in query_results.items():
            markdown_fp = os.path.join(arxiv_db_path, f'{year}.md')
            if os.path.exists(markdown_fp):
                # merge the new entries into those already stored for this year
                old_results = load_markdown(markdown_fp)
                query_set = set([item['short_id'] for item in old_results])
                for item in results:
                    if item['short_id'] not in query_set:
                        old_results.append(item)
                results = old_results
            results = sorted(results, key=lambda item: item['updated_sorted'])

            markdown = []
            markdown.append(f"# {year}\n")

            toc = []
            content = defaultdict(list)
            for result in results:
                # 'YYYY-MM-DD HH:MM:SS' -> 'YYYY-MM', the section the entry goes into
                ym = result['updated'].rsplit('-', 1)[0]
                if ym not in toc:
                    toc.append(ym)
                paper = f"<details>\n\n<summary>{result['updated']} - {result['title']}</summary>\n\n" \
                        f"- *{', '.join(result['authors'])}*\n\n" \
                        f"- `{result['short_id']}` - [abs](http://arxiv.org/abs/{result['short_id']}) - [pdf]({result['pdf_url']})\n\n" \
                        f"> {result['summary']}\n\n" \
                        f"</details>\n\n"
                content[ym].append(paper)

            markdown.append("## TOC\n")
            toc = sorted(toc)
            markdown.append("\n".join([f"- [{t}](#{t})" for t in toc])+'\n')

            for ym, papers in content.items():
                markdown.append(f"## {ym}\n")
                markdown.append("".join(papers))

            with open(markdown_fp, "w", encoding='utf-8') as f:
                f.write("\n".join(markdown))

        # persist the id set only when this subject produced new entries
        if len(query_results) > 0:
            with open(os.path.join(arxiv_db_path, 'db.txt'), "w") as f:
                db_str = json.dumps(list(db_set))
                f.write(db_str)
def set_metadata(filename: str, title: str, author: str):
    """Set the author and title tags of ``filename`` in place with ``exiftool``.

    The tool's output is captured rather than printed. Because the command
    is run with ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = ["exiftool", filename, "-overwrite_original"]
    command += [f"-Author={author}", f"-Title={title}"]
    subprocess.run(command, check=True, capture_output=True)


lines = sys.stdin.readlines()
paper_ids = [parse_line(line.strip()) for line in lines]
paper_ids = [x for x in paper_ids if x is not None]
papers = arxiv.Search(id_list=paper_ids).results()

for paper, paper_id in zip(papers, paper_ids):
    src_filename = f"{paper_id}.pdf"
    dst_filename = paper_to_filename(paper)
    if os.path.exists(src_filename):
        print(f"[Rename] {src_filename}")
        os.rename(src_filename, dst_filename)
    else:
        print("[Download]")
        paper.download_pdf(filename=dst_filename)
    print(f"file:    {dst_filename}")
    print(f"url:     {paper.entry_id}")
    print(f"authors: {[str(x) for x in paper.authors]}")
    print(f"title:   {paper.title}\n")
    set_metadata(
Exemple #27
0
 def test_sleep_zero_delay(self, patched_time_sleep):
     """With `delay_seconds=0`, back-to-back requests never sleep."""
     client = arxiv.Client(page_size=1, delay_seconds=0)
     search = arxiv.Search(query="quantum")
     feed_url = client._format_url(search, 0, 1)
     # Two consecutive fetches of the same URL.
     for _ in range(2):
         client._parse_feed(feed_url)
     patched_time_sleep.assert_not_called()
Exemple #28
0
 def test_invalid_id(self):
     """Searching for the ID `0000.0000` yields no results."""
     search = arxiv.Search(id_list=["0000.0000"])
     fetched = list(search.get())
     self.assertEqual(len(fetched), 0)
Exemple #29
0
 def test_no_duplicates(self):
     """No entry ID appears twice among the first 100 results."""
     search = arxiv.Search("testing", max_results=100)
     seen_ids = set()
     for result in search.get():
         entry_id = result.entry_id
         self.assertFalse(entry_id in seen_ids)
         seen_ids.add(entry_id)
Exemple #30
0
 def broken_get():
     """Request the first "quantum" result through `broken_client`."""
     quantum_search = arxiv.Search(query="quantum")
     results = broken_client.get(quantum_search)
     return next(results)