def _parse_selection(text, count):
    """Turn user input such as "1, 3,4" into zero-based indexes [0, 2, 3].

    Entries that are not whole numbers between 1 and ``count`` are reported and
    skipped instead of aborting the whole selection.
    """
    indexes = []
    for token in text.split(","):
        token = token.strip()
        if not token:
            continue
        if not token.isdigit() or not 1 <= int(token) <= count:
            print("Ignoring invalid selection: " + token)
            continue
        # The menu is numbered from 1, the result list from 0.
        indexes.append(int(token) - 1)
    return indexes


def main(argv):
    """Search arXiv from the command line and download the papers the user picks.

    Options parsed from ``argv``:
        -q / --query   search query (required)
        -l / --limit   maximum number of results to list (default 10)
        -o / --output  existing directory to download into (default ./input)
        -h / --help    print the usage line and exit

    Exits with status 2 on a usage error or when the output directory does not
    exist, and with status 0 after printing help.
    """
    usage = 'Use syntax: python arxiv-downloader.py -q "network node centrality" -l 50 -o ./input'

    try:
        # Every option takes a value except -h, hence "q:l:o:h".
        opts, args = getopt.getopt(argv, "q:l:o:h",
                                   ["query=", "limit=", "output=", "help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Init defaults
    limit = 10
    query = False
    outputDirectory = "./input"

    # Parse arguments
    for opt, arg in opts:
        if opt in ("-q", "--query"):
            query = arg
        elif opt in ("-h", "--help"):
            print(usage)
            sys.exit(0)
        elif opt in ("-l", "--limit"):
            try:
                limit = int(arg)
            except ValueError:
                print(usage)
                sys.exit(2)
        elif opt in ("-o", "--output"):
            outputDirectory = arg

    if not query:
        print(usage)
        sys.exit(2)

    if not os.path.isdir(outputDirectory):
        print("Given output directory is not a directory.")
        sys.exit(2)
    # Downloads land in the current working directory.
    os.chdir(outputDirectory)

    res = arxiv.query(query, prune=True, start=0, max_results=limit)

    # Display titles (only entries that actually carry one are selectable)
    results = [elem for elem in res if "title" in elem]
    for number, elem in enumerate(results, 1):
        print(str(number) + ". " + elem["title"] + "\n")

    try:
        # Python 2: plain input() would eval() whatever the user types.
        read_line = raw_input
    except NameError:
        read_line = input
    toDownload = read_line("Enter the numbers of the papers you want to download separated by commas: \n")

    for index in _parse_selection(toDownload, len(results)):
        arxiv.download(results[index])
Exemple #2
0
def arxiv_search(query, message):
    """Reply to ``message`` with up to three arXiv results for ``query``.

    Each result becomes one HTML line: the first author (followed by "et al."
    when there are co-authors), the title linked to the arXiv page, and the
    first 250 characters of the abstract. Any exception is logged through
    ``action_log`` instead of being propagated.
    """
    summary_limit = 250
    try:
        arxiv_search_res = arxiv.query(search_query=query, max_results=3)
        entries = []
        for paper in arxiv_search_res:
            summary = paper['summary']
            # Show the ellipsis exactly when the abstract is really cut.
            end = '…' if len(summary) > summary_limit else ''
            a_name = paper['authors'][0]
            if len(paper['authors']) > 1:
                # The template below supplies the closing period.
                a_name += ' et al'
            entries.append(
                '• {0}. <a href="{1}">{2}</a>. {3}{4}\n'.format(
                    escape(a_name), paper['arxiv_url'],
                    escape(paper['title'].replace('\n', ' ')),
                    escape(summary[0:summary_limit].replace('\n', ' ')),
                    end))
        query_answer = ''.join(entries)
        print(query_answer)
        user_action_log(message, "called arxiv search with query {}".format(query))
        my_bot.reply_to(message, query_answer, parse_mode="HTML")

    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception:\n{}: {}\nat {} line {}\n"
                   "Creating the alert file.".format(exc_type, ex, fname, exc_tb.tb_lineno))
                            shutil.move(pdf, path + pdf)
                            print("Move the original pdf to " + path)

                    else:
                        if not os.path.exists(non_arxiv_dir):
                            os.makedirs(non_arxiv_dir)
                        shutil.move(pdf, non_arxiv_dir + pdf)
                        print('Move the original pdf to ' + non_arxiv_dir)
                    print('')

                    time.sleep(waittime)
                    continue

                metadata = PdfReader(pdf).Info
                if '/arxiv_id' in metadata and '/updated' in metadata:
                    paper = arxiv.query(
                        id_list=[metadata['/arxiv_id'][1:-1]])[0]
                    updated = metadata['/updated'][1:-1]
                else:
                    paper = arxiv.query(id_list=[pdf[:-4].split('v')[0]])[0]
                    updated = ''

                authors = paper['authors']
                title = ' '.join(paper['title'].split())
                year = str(paper['published_parsed'].tm_year)
                arxiv_id = paper['id'].split('/')[-1].split('v')[0]
                authors = paper['authors'][0].split(' ')[-1]
                title = title.replace(':', '').replace(',', '').replace(
                    '-', ' ').replace('/', ' ')
                pdf_name = authors + ' (' + year + ') - ' + title + '.pdf'

                print()
def arxiv_query_title(title):
    """Search arXiv for papers by title.

    Hyphens in ``title`` are replaced by spaces and the result is wrapped in a
    double-quoted ``ti:`` field query, which is handed to ``arxiv.query``.
    """
    phrase = title.replace('-', ' ')
    return arxiv.query(search_query='ti:"' + phrase + '"')
def _split_versioned_id(versioned_id):
    """Split an id such as ``'1706.03762v12'`` into ``('1706.03762', '12')``."""
    base, sep, version = versioned_id.rpartition('v')
    if sep and version.isdigit():
        return base, version
    # Not in the "<id>v<N>" form: keep the historical fixed-width slicing.
    return versioned_id[:-2], versioned_id[-1]


def _prepend_rows(new_rows, articles):
    """Return ``articles`` with ``new_rows`` stacked on top and a fresh index."""
    return pd.concat([new_rows, articles],
                     axis=0,
                     sort=False,
                     ignore_index=True)


def update_articles():
    """Merge the latest arXiv results into the stored article table.

    Loads ``DeepLearningArticles.json``, queries arXiv for the 100 most recently
    updated matches of the module-level ``search_query``, then:

    * prepends papers whose id is not stored yet, and
    * replaces a stored paper when the fetched version number is higher.

    The merged table is written to ``OUTPUT_FILE`` and prettified, and the
    number of changes is printed.

    NOTE(review): ``article_id`` is assumed to return id strings of the form
    ``<arxiv id>v<version>`` (without the URL prefix) - confirm against its
    definition.
    """
    # Updating records
    ordered_articles = pd.read_json("DeepLearningArticles.json",
                                    orient='index')

    # Dictionary for existing articles {unique id: version number}
    article_dict = dict(
        _split_versioned_id(item) for item in article_id(ordered_articles))

    # Getting new articles, change max_results
    new_articles = arxiv.query(search_query,
                               max_results=100,
                               sort_by="lastUpdatedDate",
                               sort_order="descending")
    new_articles_df = pd.DataFrame.from_dict(new_articles)
    ordered_new_articles = new_articles_df.reindex(columns=[
        'title', 'author', 'authors', 'id', 'arxiv_comment',
        'arxiv_primary_category', 'published', 'summary', 'tags', 'updated'
    ])

    unique_id_new = article_id(ordered_new_articles)
    prefix = 'http://arxiv.org/abs/'
    counter = 0

    for item in unique_id_new:
        article_key, version = _split_versioned_id(item)
        known_version = article_dict.get(article_key)

        if known_version is None:
            # Newly published article (new articles are added at the front)
            outcome = "Added a new paper."
        elif int(version) > int(known_version):
            # Newer version of a stored article: remove the stale row(s) by id
            # rather than by position, so a non-default index cannot drop the
            # wrong row and a missing old row is not an error.
            old_article_id = (prefix + item[:len(item) - len(version)] +
                              known_version)
            ordered_articles = ordered_articles[
                ordered_articles['id'] != old_article_id]
            outcome = "Updated a newer version."
        else:
            continue

        article_dict[article_key] = version
        fetched_rows = ordered_new_articles[ordered_new_articles['id'] ==
                                            prefix + item]
        ordered_articles = _prepend_rows(fetched_rows, ordered_articles)
        print(outcome)
        counter += 1

    ordered_articles.to_json(OUTPUT_FILE, orient='index')
    prettify_json(OUTPUT_FILE)
    print("Update completed: {} updates made.".format(counter))
Exemple #6
0
from datetime import datetime
import re
import requests
from googletrans import Translator
from time import sleep

import arxiv

# Webhook POST target URL.
# NOTE(review): the IFTTT key is embedded in this URL; consider loading it from
# configuration rather than keeping it in source.
API_URL = "https://maker.ifttt.com/trigger/arxivLine/with/key/fdpKKfnpX20wqLHVNK2r4zom5lnmyU3jlBVzZ6zAfk2"
# Search query: restrict results to one arXiv category. The original literal
# carried a stray, unbalanced single quote after "cat:".
QUERY = "cat:astro-ph.IM"
result_list = arxiv.query(query=QUERY, max_results=2, sort_by='submittedDate')

translator = Translator()

# Announce the run by posting the current local timestamp to the webhook.
dt = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
requests.post(API_URL, data={"value1": dt})


def translate_post():
    title_jpn = translator.translate(title, src='en', dest='ja').text
    abst_jpn = translator.translate(abst, src='en', dest='ja').text
    print("---------" + str(count) + "ページ目----------")
    print("author{}".format(author))
    print(url)
    print("title:{}".format(title_jpn))
    print("date:{}".format(date))
    print("Abstract:{}".format(abst_jpn))

    message = "\n".join([
async def arxiv_random(message):
    """Reply to ``message`` with a randomly chosen, recently listed arXiv math paper.

    Works out a "last published" date in US/Eastern time, asks the arXiv OAI
    endpoint for the identifiers of the ``math`` set from that date on, picks
    one at random, looks it up through ``arxiv.query`` and replies with an HTML
    summary plus the PDF link and its size. A 503 response pushes
    ``arxiv_checker.last_call`` back; any exception is logged via ``action_log``.
    """
    user_action_log(message, "made arxiv random query")
    try:
        eastern = pytz.timezone('US/Eastern')
        eastern_time = datetime.datetime.now(eastern)
        # publications on 20:00
        if eastern_time.hour < 20:
            eastern_time -= datetime.timedelta(days=1)
        # no publications on friday and saturday
        if eastern_time.weekday() == 5:
            eastern_time -= datetime.timedelta(days=2)
        elif eastern_time.weekday() == 4:
            eastern_time -= datetime.timedelta(days=1)
        last_published_date = eastern_time.strftime("%Y-%m-%d")
        response = requests.get('http://export.arxiv.org/oai2',
                                params={
                                    'verb': 'ListIdentifiers',
                                    'set': 'math',
                                    'metadataPrefix': 'oai_dc',
                                    'from': last_published_date
                                })
        action_log("Random arxiv paper since {}".format(last_published_date))
        # everything went fine
        if response.status_code == 200:
            response_tree = ElementTree.fromstring(response.content)
            num_of_papers = len(response_tree[2])
            if num_of_papers == 0:
                user_action_log(
                    message, "arxiv random query failed: no papers since {}".
                    format(last_published_date))
                return
            # randrange excludes the upper bound; randint(0, n) could return n
            # and index one past the last paper.
            paper_index = random.randrange(num_of_papers)
            paper_arxiv_id = response_tree[2][paper_index][0].text.split(':')[
                -1]  # hardcoded
            paper_obj = arxiv.query(id_list=[paper_arxiv_id])[0]
            paper_link = paper_obj['pdf_url'].replace('http://',
                                                      'https://') + '.pdf'
            paper_link_name = paper_link.split("/pdf/")[1]
            print(paper_link)
            print(paper_link_name)
            # HEAD request only: the size comes from the Content-Length header.
            req_pdf_size = requests.head(paper_link)
            pdf_size = round(
                int(req_pdf_size.headers["Content-Length"]) / 1024 / 1024, 2)
            query_answer = '{}. <a href="{}">{}</a>. {}\n\n— <a href="{}">{}</a>, {} Мб\n'.format(
                paper_obj['author_detail']['name'], paper_obj['arxiv_url'],
                escape(paper_obj['title'].replace('\n', ' ')),
                escape(paper_obj['summary'].replace('\n', ' ')), paper_link,
                paper_link_name, pdf_size)
            await message.reply(query_answer,
                                parse_mode="HTML",
                                disable_web_page_preview=False)
            user_action_log(
                message, "arxiv random query was successful: "
                "got paper {}".format(paper_obj['arxiv_url']))
            # TODO(randl): doesn't send. Download and delete?
            # my_bot.send_document(message.chat.id, data=paper_link)
        elif response.status_code == 503:
            # we are querying too often
            action_log("Too much queries. 10 minutes break should be enough")
            arxiv_checker.last_call = datetime.datetime.utcnow(
            ) - datetime.timedelta(seconds=610)
        else:
            # everything went wrong
            user_action_log(
                message, "arxiv random query failed: response {}".format(
                    response.status_code))

    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception: {}: {}\nat {} line {}".format(
            exc_type, ex, fname, exc_tb.tb_lineno))
Exemple #8
0
    if not dics["urls"] and not dics["photos"]:
        print("skip", flush=True)
        continue

    # arxivのリンク情報を埋める
    for url in dics["urls"]:
        # Arxivだった場合
        if 'arxiv' in url:
            arxivid_ = url.split('/')[-1]
            ids = [s for s in arxivid_.split('.') if s[0].isdigit()]

            arxivid__ = ""
            for i in range(len(ids)):
                arxivid__ += ids[i] + "."
            arxivid = arxivid__[:-1]
            arxivtitle = arxiv.query(id_list=[arxivid])[0]['title']

        # Youtubeの場合
        if 'youtu' in url:  # if it is youtube link
            imgurl = subprocess.check_output("youtube-dl --get-thumbnail \"" +
                                             url + "\"",
                                             shell=True)
            youtubetitle = subprocess.check_output(
                "youtube-dl --get-title \"" + url + "\"", shell=True)
            # solve decode problem
            if type(imgurl) == bytes:
                imgurl = imgurl.decode("utf-8", "ignore")
            if type(youtubetitle) == bytes:
                youtubetitle = youtubetitle.decode("utf-8", "ignore")

    # photosの画像があるならそちらを優先
Exemple #9
0
#!/usr/bin/env python
import pprint

# import requests
import arxiv
import pandas as pd

from googletrans import Translator
translator = Translator()

# p_list = arxiv.query(query='au:"Grisha Perelman"')
p_list = arxiv.query(query='au:"Henggang Cui"')

# num = len(p_list)

# print(type(l))

# print(type(l[0]))

#pprint.pprint(l[0], width=200)

for i in p_list:
    print(
        "\n\n\n----------------------------------------------------------------------------------------------------"
    )
    print("\nタイトル:\n" +
          translator.translate(i['title'], src='en', dest='ja').text + '\n(' +
          i['title'] + ')')
    print("\npublished:\n" + i['published'])
    print("\nauthors:")
    for j in i['authors']:
Exemple #10
0
    file.write(response.read())
    file.close()
    print('下载论文' + " " + savePath)
    uploaded = drive.CreateFile({'title': savePath})
    #uploaded.SetContentString(response.read())
    uploaded.SetContentFile('./' + savePath)
    uploaded.Upload()
    print("保存至google drive 云盘成功")
    os.remove('./' + savePath)


keywords = 'Machine Learning'
# Authenticate and build the Google Drive client used for the uploads.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('接入google drive')

# Bound to `papers` rather than `list` so the builtin is not shadowed.
papers = axv.query(search_query=keywords, start=0, max_results=2000)
print('搜索arxiv上关键字:{} 相关的论文'.format(keywords))

print('共找到arxiv收录论文{}篇'.format(len(papers)))

print('开始下载...')

for obj in papers:
    # Build the file name from the title, replacing spaces, slashes and
    # newlines with underscores.
    filename = obj['title'] + ".pdf"
    filename = filename.replace(' ', '_').replace('/', '_').replace('\n', '_')
    pdf_url = obj['pdf_url']
    download_arxiv_pdf(drive, filename, pdf_url)
Exemple #11
0
##### arXiv.org
##############################
import arxiv
# Fetch up to 10 results, starting at offset 0, for the search term 'machine'.
aa = arxiv.query(search_query='machine', start=0, max_results=10)
Exemple #12
0
def get_arxiv_link(bot, msg):
    """Answer a "/arxiv <query>" message with the top arXiv hit.

    Sends the title and author of the first result as text, then downloads the
    paper and sends the file as a document. When the search returns nothing a
    short notice is sent instead.
    """
    results = arxiv.query(msg['text'].replace("/arxiv ", ""), max_results=1)
    if not results:
        bot.sender.sendMessage("No arXiv results found.")
        return

    paper = results[0]
    bot.sender.sendMessage("Title: {}\nAuthor: {}".format(paper['title'], paper['author']))
    # Close the downloaded file once it has been sent instead of leaking the handle.
    with open(arxiv.download(paper), "rb") as pdf_file:
        bot.sender.sendDocument(pdf_file)
 def search_n(self, query, n):
     """Log ``query``, fetch up to ``n`` arXiv results and return them shaped."""
     self.logger.info(query)
     return self.shaping(arxiv.query(query=query, max_results=n))
 def search_n_random(self, query, n):
     """Log ``query``, fetch up to ``n`` arXiv results and return a shaped sample.

     Four results are drawn at random when at least four came back; otherwise
     every result is returned.
     """
     self.logger.info(query)
     found = arxiv.query(query=query, max_results=n)
     if len(found) < 4:
         return self.shaping(found)
     return self.shaping(random.sample(found, 4))
def main():
    """Collect PDFs, entity sets and citation counts for stored publications.

    Command-line arguments: the database name, how many papers to fetch, how
    many items to skip, and the version number used to name the overview CSV.
    For every publication of each book title the function tries to obtain a
    PDF (direct link or arXiv title match), writes the full text and per-facet
    entity files, looks up the citation count when a PDF was found, and writes
    the overview CSV every 20 publications and once more at the end.
    """

    # ################### #
    #      SETUP ARGS     #
    # ################### #

    parser = argparse.ArgumentParser(
        description='Fetch all information for papers')
    parser.add_argument('database',
                        metavar='Database',
                        type=str,
                        help='database name of data collection')
    parser.add_argument('number_papers',
                        metavar='Number of Papers',
                        type=int,
                        help='number of papers to be downloaded')
    parser.add_argument(
        'skip_items',
        metavar='Number items to skip',
        type=int,
        help='number of items to skip from returned collection')
    parser.add_argument(
        'version',
        metavar='Version of overview csv',
        type=int,
        help='versioning number used to name overview csv for databases')

    args = parser.parse_args()
    database = args.database
    number_papers = args.number_papers
    skip_items = args.skip_items
    version = args.version

    client = MongoClient('localhost:4321')
    db = client.pub
    booktitles = ['TREC']

    # Header row of the overview CSV; the order mirrors `paper_info` below.
    facets_columns = ';'.join(facets)
    csv_columns = [
        'paper_id', 'has_pdf', facets_columns, 'number_citations',
        'booktitle', 'pdf_url', 'year', 'title', 'type', 'authors'
    ]

    # ########################### #
    #      FETCH PUBLICATIONS     #
    # ########################### #

    # print("Fetching publication information from TSE-NER server; publication attributes, has_pdf, number_entities, #citations_pub, #citations_author: ")

    for booktitle in booktitles:
        papers = []
        counter_pub = 0
        counter_pdf = 0
        counter_cit = 0
        results = db.publications.find({
            'booktitle': booktitle
        }).skip(skip_items).limit(number_papers).batch_size(100)
        print(
            f'Fetching {results.count(True)} out of {results.count()} total publications information for conference: {booktitle}'
        )

        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        querier.apply_settings(settings)
        # querier.save_cookies()
        scholar_query = scholar.SearchScholarQuery()

        for pub in results:
            if not pub['title'] or not pub['authors']:
                continue

            counter_pub += 1
            author1 = pub['authors'][0]
            title = pub['title'].lower().capitalize().strip('.')
            # [_id, has_pdf, number_entities, citations, booktitle, ee, year,
            #  title, type, authors]
            paper_info = [
                pub['_id'], 'false', '-1;-1', '-1', booktitle, pub['ee'],
                pub['year'],
                "'%s'" % title, pub['type'], 'author1;author2'
            ]
            no_accent_author1 = unidecode.unidecode(author1)

            # Set author info
            paper_info[9] = ';'.join(
                f'{author}' for author in pub['authors']).strip(';')

            pdf_file_path = f'{ROOTPATH}/data/{database}/{booktitle.lower()}/pdf/'
            os.makedirs(os.path.dirname(pdf_file_path), exist_ok=True)

            # Have multiple PDF fetch methods: Direct EE link, Arxiv
            # ADD ELSAPY
            if pub['ee'][-4:].lower() == '.pdf':
                paper_info[1] = 'true'
                download_pdf(pdf_file_path, paper_info[5], database, booktitle,
                             paper_info[0])
            else:
                arxiv_query = f'au:{author1}+AND+ti:{title}'
                articles = arxiv.query(search_query=arxiv_query)
                if articles:
                    for art in articles:
                        # Accept only an exact match on the normalised title.
                        if art['title'].lower().capitalize().strip(
                                '.') == title:
                            paper_info[5] = art['pdf_url']
                            paper_info[1] = 'true'
                            arxiv.download(art, pdf_file_path, slugify=True)
                            # Rename the slug-named download to the paper id.
                            os.rename(
                                f'{pdf_file_path}{to_slug(art["title"])}.pdf',
                                f'{pdf_file_path}{paper_info[0]}.pdf')

                            print(f'Finished PDF download for {paper_info[0]}')

            # Download full text
            if 'content' in pub and 'fulltext' in pub['content']:
                write_full_text_file(paper_info[0], database, booktitle,
                                     pub['content']['fulltext'])

            # Get distinct #entities for total facets
            # ADD PROPER ENTITIES EXTRACTION
            facets_entities = ''
            for facet in facets:
                entities = fetch_paper_entities(pub['_id'], facet, db)
                facets_entities += f'{len(entities)};'

                # Write paper facet entity set to TXT
                write_entity_set_file(pub['_id'], booktitle, entities,
                                      database, facet)

            paper_info[2] = facets_entities.strip(';')

            # Only fetch citations if a PDF has been downloaded
            if paper_info[1] == 'true':
                counter_pdf += 1
                # Get number of citations info
                scholar_query.set_author(no_accent_author1)
                scholar_query.set_phrase(title)
                scholar_query.set_num_page_results(1)
                querier.send_query(scholar_query)

                # Use the first article found when its title matches
                if querier.articles and title == querier.articles[0][
                        'title'].lower().capitalize().strip('.'):
                    print(
                        f'Fetched number citations for {paper_info[0]}: {querier.articles[0]["num_citations"]}'
                    )
                    paper_info[3] = querier.articles[0]['num_citations']
                    counter_cit += 1

            # Add paper information to list
            papers.append(paper_info)
            print(f'✓ {pub["_id"]}')

            # Write papers information to CSV file every 20 publications.
            # `==`, not `is`: identity of ints is an implementation detail.
            if counter_pub % 20 == 0:
                print('----- STATISTICS -----')
                print("Processed:", counter_pub)
                write_arrays_to_csv(papers, booktitle, database, csv_columns,
                                    skip_items, version)
                print(
                    f'PDFs downloaded for {counter_pdf}/{counter_pub} publications for {booktitle}'
                )
                print('----------------------')

        print('----- FINAL STATISTICS -----')
        print("Processed:", counter_pub)
        write_arrays_to_csv(papers, booktitle, database, csv_columns,
                            skip_items, version)
        print(
            f'PDFs downloaded for {counter_pdf}/{counter_pub} publications for {booktitle}'
        )
        print('-----------------------')
        print(
            f'Finished processing {counter_pub} publications and downloading {counter_pdf} PDFs for {booktitle}'
        )
    "-t",
    "--topic",
    type=str,
    nargs=1,
    default="text summarization",
    help=
    'Topic to query arXiv for (surround in quotes if topic includes spaces). Default: "text summarization"'
)
parser.add_argument(
    "-n",
    "--num_results",
    type=int,
    default=10,
    help="Number of research articles to download from arXiv. Default: 10")

args = parser.parse_args()

results = arxiv.query(query=args.topic,
                      max_results=args.num_results,
                      iterative=True)

for i, paper in enumerate(results()):

    paper_info = {'title': paper['title'], 'url': paper['pdf_url']}

    pdf_filename = paper_info['url'].split('/')[-1]
    pdf_dir = "../downloads/"

    os.makedirs(pdf_dir, exist_ok=True)

    arxiv.arxiv.download(paper, dirpath=pdf_dir)
#def getrefname(b):
    #i,j = b.index('{'),  b.index(',') 
    #return b[i+1:j]

for refname, arxivid, selected in publist:
    query = "SELECT {} from documents WHERE bibtex LIKE '%{}%'".format(','.join(headers), refname)
    refres = list(ref.con.execute(query).next())
    author, title, year, filename, bibtex = refres
    bibdic = ref.parse_bibtex(bibtex)
    author = bibdic['author'] # include firstnames 
    abstract = ''
    #refname = getrefname(bibtex)
    print refname
    if arxivid:
        arxivres = arxiv.query(id_list=[arxivid])[0]
        author = ', '.join(arxivres['authors'])
        abstract = arxivres['summary']
        assert filter(str.isalnum, str(arxivres['title'])).lower() == filter(str.isalnum, title).lower()
    # yaml long strings with gt (>) then indented lines.
    abstract = '\n'.join('    '+li for li in abstract.split('\n'))
    ###### WRITE YAML ALWAYS
    yaml =  """---
title: "{}"
author: "{}"
journal: "{}"
year: {}
arxiv: "{}"
shortname: {}
thumbnail: /{}/{}.png
excerpt: ""
Exemple #18
0
def found():
    """Handle the search form: query the chosen engine and render a results table.

    Reads ``engine`` (arxiv, scholar, nature or ieee), ``keyword`` and
    ``number`` from the submitted form, collects up to ``number`` results from
    that source into a DataFrame and renders it through ``searchengine.html``.
    An unrecognised engine falls through and returns ``None``.
    """
    branch = request.form['engine']  #ieee or arxiv
    keyword = request.form['keyword']
    noofresults = int(request.form['number'])
    print('hello')
    branch = branch.lower()
    if branch == 'arxiv':
        print('hi')
        result = arxiv.query(query=keyword, max_results=noofresults)
        # Build all rows first; growing a DataFrame by repeated concat is quadratic.
        rows = [{
            "Title": paper['title'],
            "Published Date": paper['published'],
            # Point at the PDF instead of the abstract page.
            "Download Link": paper['arxiv_url'].replace('abs', 'pdf')
        } for paper in result]
        data = pd.DataFrame(
            rows, columns=["Title", 'Published Date', 'Download Link'])
        return render_template('searchengine.html',
                               tables=[
                                   data.to_html(
                                       render_links=True,
                                       classes=['table table-bordered'])
                               ])

    elif branch == 'scholar':
        headings = []
        links = []

        def getdata(url):
            headers = {
                "User-Agent":
                "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
            }
            r = requests.get(url, headers=headers)
            return r.text

        # NOTE(review): with this guard the branch only runs when the module is
        # executed as a script; otherwise the view returns None - confirm intent.
        if __name__ == "__main__":

            noOfLinks = noofresults
            search = "+".join(keyword.split(" "))
            # Walk the result pages ten hits at a time.
            for start in range(0, 1000, 10):
                data = getdata(
                    f"https://scholar.google.com/scholar?start={start}&q={search}")
                soup = BeautifulSoup(data, "lxml")
                # print(soup.prettify())
                terms = soup.findAll('a')
                # NOTE(review): term["href"] raises KeyError for anchors
                # without an href attribute - confirm this cannot occur.
                for term in terms:
                    if ".pdf" in term["href"]:
                        links.append(term['href'])
                for term in terms:
                    if ".pdf" in term["href"]:
                        x = soup.findAll(
                            'a',
                            attrs={"data-clk-atid": term["data-clk-atid"]})
                        for y in x:
                            z = y.getText()
                            # Skip the "[PDF]" link itself, keep the title link.
                            if "[PDF]" not in z:
                                headings.append(z)

            d = {"Title": headings, "PDFLink": links}

            df = pd.DataFrame(d)

            print("Titles and PDF links are as follow:")
            # Slicing past the end simply returns every row.
            fdf = df[:noOfLinks]
            return render_template('searchengine.html',
                                   tables=[
                                       fdf.to_html(
                                           render_links=True,
                                           classes=['table table-bordered'])
                                   ],
                                   current_user=current_user)
    elif branch == 'nature':
        headings = []
        alinks = []
        abstracts = []
        datePublished = []
        authors = []

        # Get data from website
        def getdata(url):
            r = requests.get(url)
            return r.text

        # NOTE(review): same script-only guard as in the scholar branch.
        if __name__ == "__main__":

            # Getting initial soup
            noOfResults = noofresults
            search = "+".join(keyword.split(" "))
            data = getdata(f"https://www.nature.com/search?q={search}")
            soup = BeautifulSoup(data, "lxml")

            # Getting Headings
            # NOTE(review): the fixed slice offsets below depend on the page
            # layout at the time of writing - verify against the live markup.
            terms = soup.findAll('span', attrs={"class": "visually-hidden"})
            terms = terms[6:]
            terms = terms[:-9]
            for term in terms[:noOfResults]:
                term = term.getText()[12:]
                term = term[:-49]
                headings.append(term)

            # Getting Links
            links = soup.findAll('a',
                                 attrs={"data-track-action": "search result"})
            for link in links[:noOfResults]:
                alinks.append(f"https://www.nature.com{link['href']}")

            # Getting abstracts, publication dates and authors
            for alink in alinks[:noOfResults]:
                dt = getdata(alink)
                sp = BeautifulSoup(dt, "lxml")
                tm = sp.find('div',
                             attrs={"class": "c-article-section__content"})
                date = sp.find('time', attrs={"itemprop": "datePublished"})
                author = sp.findAll('a', attrs={"data-test": "author-name"})

                if len(author) > 0:
                    authors.append([autho.getText() for autho in author])
                else:
                    authors.append("Authors not mentioned")

                if date:
                    datePublished.append(date.getText())
                else:
                    datePublished.append("Date Published not mentioned")

                if tm:
                    tm = tm.find("p").getText()
                    abstracts.append(tm)
                else:
                    abstracts.append("Abstract Not Available")

            # Creating dictionary of the lists
            d = {
                "Heading": headings,
                "ArticleLink": alinks,
                "ArticleAbstract": abstracts,
                "PublicationDate": datePublished,
                "Authors": authors
            }

            # Creating dataframe
            df = pd.DataFrame(d)
            return render_template('searchengine.html',
                                   tables=[
                                       df.to_html(
                                           render_links=True,
                                           classes=['table table-bordered'])
                                   ],
                                   current_user=current_user)

    elif branch == 'ieee':
        titles_list = []
        links_list = []
        date_list = []
        abstract_list = []
        citation_list = []
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Origin": "https://ieeexplore.ieee.org",
            "Content-Type": "application/json",
        }
        # Number of pages to request; the code assumes 25 records per page.
        no = math.ceil(noofresults / 25)
        for page_no in range(1, no + 1):
            payload = {
                "newsearch": True,
                "queryText": keyword,
                "highlight": True,
                "returnFacets": ["ALL"],
                "returnType": "SEARCH",
                "pageNumber": page_no
            }
            r = requests.post("https://ieeexplore.ieee.org/rest/search",
                              json=payload,
                              headers=headers)
            page_data = r.json()
            for record in page_data.get("records", []):
                titles_list.append(record["articleTitle"])
                links_list.append('https://ieeexplore.ieee.org' +
                                  record["documentLink"])
                date_list.append(record["publicationDate"])
                citation_list.append(record["citationCount"])
                abstract_list.append(
                    record.get('abstract', 'Abstract not present'))

        # Build the table only after every page has been fetched; returning
        # from inside the loop would stop after the first page.
        d = {
            "Title": titles_list,
            "Abstract": abstract_list,
            "Link": links_list,
            "Publication Date": date_list,
            "No of Citations": citation_list
        }

        df = pd.DataFrame.from_dict(d)
        finaldf = df[:noofresults]  #dataframe
        return render_template('searchengine.html',
                               tables=[
                                   finaldf.to_html(
                                       render_links=True,
                                       classes=['table table-bordered'])
                               ],
                               current_user=current_user)
                  columns=[
                      "Date", "Time", "Location", "#", "Session",
                      "Session Title", "Paper ID", "Paper Title", "Authors",
                      "Title-arxiv", "Abstract", "URL"
                  ])
# Fill `df` from the fourth table on the page (nine cells per paper row) and
# look each paper title up on arXiv.
pa_table = soup.select("table")[3]
# Parse the cells once; re-running find_all() for every cell is quadratic.
cells = pa_table.find_all('td')
index = 0
for i, cell in enumerate(cells):
    i_column = i % 9
    if cell.string is not None:
        df.iloc[index, i_column] = cell.string
    if i_column == 8:
        # Last cell of the row: the title in column 7 is now available.
        title = df.iloc[index, 7]
        if isinstance(title, str):
            tokens = title.replace(":", "").replace("-", " ").strip().split()
        else:
            # The title cell had no plain string; nothing to search for.
            tokens = []
        # Joining all tokens avoids a dangling "+AND+all:" for one-word titles.
        s = "+AND+all:".join(tokens)

        # Start empty for every row so a failed lookup cannot reuse the
        # previous row's results.
        results = []
        if tokens:
            try:
                results = arxiv.query(s, prune=True, start=0, max_results=1)
            except Exception as exc:
                print("arXiv lookup failed for row " + str(index) + ": " +
                      str(exc))

        if len(results) != 0:
            df.iloc[index, 9] = results[0]['title'].replace("\n ", "")
            df.iloc[index, 10] = results[0]['summary'].replace("\n", " ")
            df.iloc[index, 11] = results[0]['pdf_url']

        print(str(index) + "/783 done")
        index += 1

df.to_csv('paper_lists.csv')
# Destination directory for the downloaded PDFs (4th command-line argument).
# Single quotes are stripped and a trailing "/" is enforced.
# NOTE(review): dest[-1] raises IndexError when the argument is empty.
dest = str(sys.argv[3])
dest = dest.replace("'", "")
if dest[-1] != "/":
    dest = dest + "/"

# Maximum number of papers to request from arXiv (5th argument)
max_len = int(sys.argv[4])

# Terms to search for inside the papers (6th argument); quotes are stripped
# and every comma becomes " AND".  Not used within this part of the script.
search_pdf = str(sys.argv[5])
search_pdf = search_pdf.replace("'", "")
search_pdf = search_pdf.replace(",", " AND")

# search_arxiv is defined earlier in the script (outside this excerpt).
result = arxiv.query(query=search_arxiv, max_results=max_len)

# Running counter used to number the downloaded files: pdf1.pdf, pdf2.pdf, ...
j = 1

for paper in result:  # every paper returned by the search

    if 'links' in paper:  # only papers that carry a 'links' entry
        links = paper['links']
        for i in links:  # inspect each link of the paper
            if i['type'] == 'application/pdf':  # keep only the PDF links
                h = i['href']  # URL of the PDF
                file = "pdf" + str(j) + ".pdf"  # sequential file name
                dest_file = dest + file  # destination directory + file name
                wget.download(h, dest_file)  # fetch the PDF into the directory
                j = j + 1
            if not bmask:
                try:
                    links = get_links_ads(bibcode, q='arxiv')
                except:  # HttpError:
                    print("rate limited ", time.localtime())
                    time.sleep(300)
                    continue

                if links:

                    arxivid = links[0].split('/')[-1]
                    aid = arxivid.split(':')[-1]

                    try:
                        # search with api
                        paper = arxiv.query(id_list=[aid])[0]
                        arxiv.download(paper)  #,prefer_source_tarfile=True)
                        pdffile = glob.glob("{}*.pdf".format(aid))[0]
                    except:
                        try:
                            section = links[0].split('/')[-2].split(':')[-1]
                            url = "https://arxiv.org/pdf/{}/{}.pdf".format(
                                section, aid)
                            pdffile = "{}.pdf".format(aid)
                            urllib.request.urlretrieve(url, pdffile)
                        except:
                            import pdb
                            pdb.set_trace()
                            continue

                    # parse pdf
Exemple #22
0
'''
import arxiv
import json

# Builds a Markdown digest of cs.CV papers whose 'updated' date equals _date.
# predate is the key of the previous run in log.json, _date the day to report.
predate = '2018-12-19'
_date = '2018-12-20'
# 14853 2018-12-13
with open('log.json', 'r') as f:
    data = json.load(f)

# Offset for this query: the previous run's start plus the count logged for it.
_start = data['log'][predate]['start'] + data['log'][predate]['cnt']

# Number of papers matched so far; also used to number the digest entries.
cnt = 0

paper = arxiv.query(search_query='cat:cs.CV',
                    start=int(_start),
                    max_results=50)
print(type(paper), len(paper), paper)
md = '# Latest CV paper updated in ' + _date

for item in paper:
    # The first 10 characters of 'updated' are compared with _date (YYYY-MM-DD).
    if item['updated'][:10] == _date:
        cnt += 1
        # NOTE(review): downurl, author and summary are not used in the visible
        # part of the script; presumably consumed further down — confirm.
        downurl = item['id']
        title = item['title_detail']['value'].replace('\n', ' ')
        author = ','.join(item['authors'])
        summary = item['summary'].replace('\n', ' ')

        # Blank line, then a numbered level-4 heading with the paper title.
        md += '\n'
        md += '\n'
        md += '#### {_order}. {_title}'.format(_order=cnt, _title=title)
Exemple #23
0
import arxiv
import json
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

def _surname_first(full_name):
    """Move the last space-separated word of *full_name* to the front."""
    return ', '.join(full_name.rsplit(' ', 1)[::-1])


# Load the search parameters; "stage1" is a list of lists of terms.
with open('./params.json') as json_file:
    params = json.load(json_file)

# Terms inside one group are ORed together, the groups themselves are ANDed.
or_groups = []
for ors in params["stage1"]:
    quoted_terms = [f'"{item}"' for item in ors]
    or_groups.append('( ' + ' OR '.join(quoted_terms) + ' )')
search_string = ' AND '.join(or_groups)
print(search_string)

# Multi-field queries
result = arxiv.query(search_query=search_string, max_results=100)
print(result[0])

# Convert every hit into a dict of BibTeX-style fields.
biblist = []
for item in result:
    bibitem = {
        'ENTRYTYPE': 'article',
        'ID': item['id'],
        'abstract': item['summary'],
        'title': item['title'],
        'journal': 'arxiv',
        'author': ' and '.join(
            [_surname_first(author) for author in item['authors']]),
        'year': str(item['published_parsed'].tm_year),
        'month': str(item['published_parsed'].tm_mon),
        'url': item['pdf_url'],
    }
    biblist.append(bibitem)

db = BibDatabase()
Exemple #24
0
def main(url, lang, small, path):
    """Translate an arXiv paper (or a local file) and render Markdown/HTML reports.

    Args:
        url: arXiv URL of the paper, or the string 'None' to skip URL mode.
            The arXiv id is taken as everything after the first 22 characters.
        lang: Target language code; must be one of
            JA, RU, PL, NL, IT, PT, ES, FR, DE.
        small: When truthy, stop after printing the translated summary,
            introduction and conclusion (no full translation, no files).
        path: Local path handed to ``ep.translate`` when ``url`` is 'None',
            or the string 'None'.

    Exits the process with status 1 when the language is unsupported or the
    paper cannot be found.
    """
    if lang not in ('JA', 'RU', 'PL', 'NL', 'IT', 'PT', 'ES', 'FR', 'DE'):
        print('Error: NOT SUPPORT LANGUAGE', file=sys.stderr)
        sys.exit(1)
    print('Mode: {}'.format(lang))

    if url == 'None':
        if path != 'None':
            ep.translate(path, small, lang)
        else:
            print('Please use -u option or -p option.\ndetail : https://pypi.org/project/eigoyurusan/')
        return

    # ------------------------------------------------------------------
    # Local helpers
    # ------------------------------------------------------------------
    class Chapter:
        """One section of the paper: heading, accumulated body text, start page."""

        def __init__(self, title, pagenum):
            self.title = title
            self.body = ''
            self.pagenum = pagenum

        def add_body(self, text):
            self.body = ''.join([self.body, text])

    def translate_in_chunks(body):
        """Translate *body* line by line in chunks of roughly 3000 characters."""
        lines = body.split(sep='\n')
        translated = ''
        ind = 0
        while ind < len(lines):
            chunk = ''
            while ind < len(lines) and len(chunk) < 3000:
                chunk = ''.join([chunk, lines[ind], '\n'])
                ind += 1
            translated = ''.join([translated, tr.traslateBydeepL(chunk, lang)])
        return translated

    def rewrite_citations(text, template):
        """Replace the markers [1]..[29] in *text* using *template* (one ``{}`` slot)."""
        for i in range(1, 30):
            text = re.sub(r'\[{}\]'.format(i), template.format(i), text)
        return text

    def write_summary(md_file, summary):
        """Write the title / metadata / abstract header of one Markdown report."""
        for k, v in summary.items():
            v = rewrite_citations(v, r'\[\^{}\]')
            if k == "arxiv_url" or k == "pdf_url":
                if k == "arxiv_url":
                    md_file.write(''.join(['## ', 'URL']))
                    md_file.write('\n')
                else:
                    md_file.write('\n')
                md_file.write(''.join([k, ' : ', '[', v, ']', '(', v, ')']))
            else:
                if k == "title":
                    md_file.write(''.join(['# ', v]))
                else:
                    md_file.write(''.join(['## ', k]))
                    md_file.write('\n')
                    md_file.write(v.replace("\n", "<br>"))
            md_file.write('\n')

    def write_chapters(md_file, chapter_list, reference_titles):
        """Write every chapter except the first as a Markdown section."""
        for c in chapter_list[1:]:
            # A title whose second character is '.' is written one level deeper.
            # The length guard protects against one-character titles.
            if len(c.title) > 1 and c.title[1] == '.':
                marker = '### '
            else:
                marker = '## '
            md_file.write(''.join([marker, c.title, ' --------- P.{}'.format(c.pagenum)]))
            md_file.write('\n')

            if c.title not in reference_titles:
                # In running text [n] becomes a footnote reference.
                c.body = rewrite_citations(c.body, '[^{}]')
            else:
                # In the reference list [n] becomes a footnote definition.
                c.body = rewrite_citations(c.body, '[^{}]:')
            md_file.write(c.body.replace("\n", "<br>"))
            md_file.write('\n')

    # Patterns used for every text box; compiled once instead of per box.
    heading_tail = r' [^\+−≤≥<>∧∨==∈×;∇/:]+$'
    heading_re = re.compile('|'.join(
        [prefix + heading_tail for prefix in (
            r'^§?[A-Z]\.?',
            r'^§?[A-Z]\.[1-9]+\.?',
            r'^§?[A-Z]\.[1-9]\.[1-9]+\.?',
            r'^§?[1-9]+',
            r'^§?[1-9]+\.[1-9]+',
            r'^§?[1-9]+\.[1-9]+\.[1-9]+',
        )] + [
            r'^Acknowledgements*', r'^Acknowledgments*', r'^ACKNOWLEDGMENTS*',
            r'^References*', r'^REFERENCES*',
            r'^Introduction*', r'^INTRODUCTION*',
        ]))
    arxiv_stamp_re = re.compile(r'^arXiv:*')
    references_re = re.compile(r'^References*|^REFERENCES*')
    conclusion_re = re.compile(r'Conclusion*|CONCLUSION*')
    introduction_re = re.compile(r'Introduction*|INTRODUCTION*')
    numbered_dot_re = re.compile(r'([1-9])\. ')

    # ------------------------------------------------------------------
    # Look the paper up
    # ------------------------------------------------------------------
    # Everything after the first 22 characters of the URL is used as the id
    # (presumably the length of "https://arxiv.org/abs/" - verify against callers).
    arXiv_id = url[22:]
    result_list = arxiv.query(id_list=[arXiv_id], max_results=1)

    if len(result_list) < 1:
        print('Error: NOT FOUND PAPER', file=sys.stderr)
        sys.exit(1)
    print('Done: Found paper')

    result = result_list[0]

    # ------------------------------------------------------------------
    # Prepare paper summary (original and translated)
    # ------------------------------------------------------------------
    summary_en = {
        "title": result.title.replace("\n", " "),
        "author": result.author,
        "arxiv_url": result.arxiv_url,
        "pdf_url": result.pdf_url,
        "date": result.updated,
        "abstract": result.summary.replace("-\n", "").replace("\n", " ").replace(". ", ".\n"),
    }

    # Same keys in the same order; only title and abstract are translated.
    summary_tr = dict(summary_en)
    summary_tr["title"] = tr.traslateBydeepL(summary_en["title"], lang)
    summary_tr["abstract"] = tr.traslateBydeepL(summary_en["abstract"], lang)

    # ------------------------------------------------------------------
    # Download PDF
    # ------------------------------------------------------------------
    doc_id = arXiv_id.replace("/", "").replace(".", "")
    now_path = os.path.dirname(os.path.abspath(__file__))
    data_path = now_path + '/data/{}'.format(doc_id)
    os.makedirs(data_path, exist_ok=True)
    pdf_title = ''.join([data_path, '/', doc_id, '.pdf'])
    urllib.request.urlretrieve(result.pdf_url, pdf_title)

    # ------------------------------------------------------------------
    # Collect the document text from the PDF, split into chapters
    # ------------------------------------------------------------------
    # Layout-analysis parameters; detection of vertical writing is enabled.
    laparams = LAParams(detect_vertical=True)

    # Resource manager that holds shared resources.
    resource_manager = PDFResourceManager()

    # PageAggregator that collects the page layout.
    device = PDFPageAggregator(resource_manager, laparams=laparams)

    # Interpreter that processes the pages.
    interpreter = PDFPageInterpreter(resource_manager, device)

    chapters = []
    current = Chapter('Abstract', 1)

    emp = " "
    piriod = ". "
    chainp = ".. "
    title_line = summary_en["title"] + '\n'

    print()
    print('-' * 30)  # separator line for readability
    print('CHAPTER LIST')
    print('-' * 30)  # separator line for readability
    with open(pdf_title, 'rb') as f:
        # PDFPage.get_pages() yields the PDFPage objects of the file in order.
        for page_num, page in enumerate(PDFPage.get_pages(f)):
            interpreter.process_page(page)  # process the page
            layout = device.get_result()  # LTPage object of the page

            # Text boxes of the page.
            boxes = pr.find_textboxes_recursively(layout)

            for box in boxes:

                pdf_text = box.get_text()

                # --- clean-up ---
                # Remove the paper title when it leaked into the box.
                pdf_text = pdf_text.replace(title_line, "")
                # Re-join words that were hyphenated across a line break.
                pdf_text = pdf_text.replace("-\n", "")
                # Remove line breaks between words and at sentence ends.
                pdf_text = pdf_text.replace("\n", " ")
                # Drop the period of "fig." so it is not taken as a sentence end.
                pdf_text = pdf_text.replace("fig.", "fig").replace("Fig.", "Fig")
                # Compress runs of three or more spaced periods.
                for i in range(3, 7):
                    pdf_text = pdf_text.replace(piriod * i, "...")
                pdf_text = pdf_text.replace(chainp, "..")
                # Outside the references, drop the period of "<digit>. " so
                # numbered headings are not split at the number.
                if current.title != 'References' and current.title != 'REFERENCES':
                    pdf_text = numbered_dot_re.sub(r'\1 ', pdf_text)
                # Break the line after every "period + space".
                pdf_text = pdf_text.replace(". ", ".\n")
                # Collapse runs of spaces.
                for i in range(1, 7):
                    pdf_text = pdf_text.replace(emp * i, " ")

                heading = heading_re.search(pdf_text)
                if heading and len(pdf_text) > 8:
                    # The box is a section heading: close the current chapter.
                    print(pdf_text)
                    chapters.append(current)
                    current = Chapter(heading.group(0), page_num + 1)
                elif arxiv_stamp_re.search(pdf_text) and current.title not in (
                        'References', 'REFERENCES', 'References '):
                    # arXiv stamp outside the references: dropped from the body.
                    continue
                else:
                    if references_re.search(current.title):
                        # Reference list: keep every entry on one line ...
                        pdf_text = pdf_text.replace("\n", " ")
                        # ... and end the line when the entry ends with a period.
                        if len(pdf_text) > 2 and pdf_text[-2] == '.':
                            pdf_text += ('\n')
                    else:
                        if len(pdf_text) < 10:
                            # Very short boxes are probably formulas: remove the
                            # line breaks (the result used to be discarded).
                            pdf_text = pdf_text.replace("\n", " ")
                    current.add_body(pdf_text)
    chapters.append(current)

    # ------------------------------------------------------------------
    # Show the translated summary
    # ------------------------------------------------------------------
    print('-' * 30)  # separator line for readability
    print('日本語概要')
    for k, v in summary_tr.items():
        print('-' * 30)  # separator line for readability
        if k == 'abstract':
            print('[概要]')
            print(v)
        else:
            print('{} : {}'.format(k, v))

    # Introduction and conclusion are translated and shown first.
    translated_text_conclusion = ''
    translated_text_introduction = ''
    for c in chapters:
        if conclusion_re.search(c.title):
            print('-' * 30)  # separator line for readability
            print('[結論]')
            translated_text_conclusion += translate_in_chunks(c.body)
            print(translated_text_conclusion)
        elif introduction_re.search(c.title):
            print('-' * 30)  # separator line for readability
            print('[導入]')
            translated_text_introduction += translate_in_chunks(c.body)
            print(translated_text_introduction)

    # In "small" mode the run ends here.
    if small:
        print('-' * 30)  # separator line for readability
        print('Thank you !')
        return

    # ------------------------------------------------------------------
    # Translate the full text
    # ------------------------------------------------------------------
    print('-' * 30)  # separator line for readability
    chapters_tr = []
    for c in chapters:
        translated = Chapter(tr.traslateBydeepL(c.title, lang), c.pagenum)
        if c.title != 'References' and c.title != 'REFERENCES':
            if conclusion_re.search(c.title):
                # Already translated above.
                translated.add_body(translated_text_conclusion)
            elif introduction_re.search(c.title):
                translated.add_body(translated_text_introduction)
            else:
                translated.add_body(translate_in_chunks(c.body))
        else:
            # The reference list is copied untranslated.
            translated.add_body(c.body)
        chapters_tr.append(translated)
        print('Done Translating : Chapter {}'.format(c.title))
    print('-' * 30)  # separator line for readability

    # ------------------------------------------------------------------
    # Write the Markdown reports
    # ------------------------------------------------------------------
    en_md_path = '{}/output_{}_EN.md'.format(data_path, doc_id)
    jp_md_path = '{}/output_{}_JP.md'.format(data_path, doc_id)
    en_html_path = '{}/output_{}_EN.html'.format(data_path, doc_id)
    jp_html_path = '{}/output_{}_JP.html'.format(data_path, doc_id)

    # UTF-8 is explicit because the generated HTML declares charset utf-8;
    # the context manager closes both files even if a write fails.
    with open(en_md_path, 'w', encoding='utf-8') as en_md, \
            open(jp_md_path, 'w', encoding='utf-8') as jp_md:
        # Table-of-contents marker
        en_md.write('[TOC]\n\n')
        jp_md.write('[TOC]\n\n')

        # Summaries (original, then translated)
        write_summary(en_md, summary_en)
        write_summary(jp_md, summary_tr)

        # Bodies (original, then translated)
        write_chapters(en_md, chapters, ('References', 'References ', 'REFERENCES'))
        # NOTE(review): the translated reference heading is only recognised
        # when it equals the Japanese title below.
        write_chapters(jp_md, chapters_tr, ('参考文献',))

    # ------------------------------------------------------------------
    # Output HTML from Markdown
    # ------------------------------------------------------------------
    import markdown
    md = markdown.Markdown(extensions=['admonition', 'toc', 'footnotes'])

    for md_path, html_path in ((en_md_path, en_html_path), (jp_md_path, jp_html_path)):
        with open(md_path, encoding='utf-8') as md_in:
            with open(html_path, 'w', encoding='utf-8') as html_out:
                # Reset the parser so state from the previous document
                # cannot leak into this one.
                md.reset()
                body = md.convert(md_in.read())
                # Wrap the fragment in a minimal HTML document.
                html = '<html lang="ja"><meta charset="utf-8"><body>'
                html += (body + '</body></html>')
                html_out.write(html)

    # ------------------------------------------------------------------
    # Show the HTML files in the browser
    # ------------------------------------------------------------------
    import webbrowser

    jp_url = 'file://{}/output_{}_JP.html'.format(data_path, doc_id)
    en_url = 'file://{}/output_{}_EN.html'.format(data_path, doc_id)

    webbrowser.open_new(jp_url)
    webbrowser.open_new(en_url)

    print('-' * 30)  # separator line for readability
    print('Thank you !')
Exemple #25
0
def download_arxiv(url, output):
    """Download the paper whose arXiv id is the last path segment of *url*.

    The file name is forced to *output* through the ``slugify`` hook.
    Returns the paper record, or None when the query yields nothing.
    """
    arxiv_id = url.split('/')[-1]
    matches = arxiv.query(id_list=[arxiv_id])
    if len(matches) <= 0 or matches[0] is None:
        return None
    first_match = matches[0]
    arxiv.download(first_match, slugify=lambda x: output)
    return first_match
Exemple #26
0
import arxiv

# Get an iterator over query results
result = arxiv.query(
  query="GAN",
  max_chunk_results=10,
  max_results=1,
  iterative=True
)

# Exhaust the iterator, keeping only the last paper.  `paper` is initialised
# first so an empty result no longer ends in a NameError below.
paper = None
for paper in result():
   pass

if paper is None:
   print("No papers found")
else:
   print(paper['title'])
   print(paper['summary'])
   print(paper['arxiv_url'])
   # List every field name available on the record
   for key in paper:
      print(key)
        QUERY = "cs."+ category[rand_cat]
        print(rand_cat,category[rand_cat])

    QUERY = "cs.AI OR cs.CV"
    for i in category:
        QUERY += "OR cs.{}".format(i)

    
#     QUERY = "cs.AI"
    dt = datetime.now().strftime("%Y%m%d")
    dt = str(int(dt)-2)
    # dt=str(20201130) #デバック用(ここは2020というように4桁表示)
    print(str(int(dt)))
    # translator = Translator() 
    # result_list = arxiv.query(query = 'cat:cs.AI AND submittedDate:[20201223000001 TO 20201223235959]',max_results=2)
    result_list = arxiv.query(query = 'cat:{} AND submittedDate:[{}000001 TO {}235959]'.format(QUERY,dt,dt),max_results=50,sort_by='submittedDate')
    # print(result_list[1])
    def translate(text):
        """Translate *text* from English to Japanese and return the result.

        Every failure is retried with a freshly created Translator.
        NOTE(review): the retry loop is unbounded and has no delay, so an
        input that always fails never returns.
        """
        tr = Translator(service_urls=['translate.googleapis.com'])
        while True:
            try:
                return tr.translate(text, src="en", dest="ja").text
            except Exception:
                # Start over with a new client before the next attempt.
                tr = Translator(service_urls=['translate.googleapis.com'])


    def translate_post(df):
        title_jpn = translate(title)
        abst_jpn = translate(abst)
Exemple #28
0
    # url is arxiv url
    # id is arxiv id
    df = pd.read_csv("files_with_url.csv")
    return df


def get_notes(df):
    """Build one note record per row of *df*.

    Each row must provide the columns ``url``, ``File`` and ``id``.  The note
    file is read from ``NOTES_DIR`` and the arXiv record is fetched by id.

    Returns:
        A list of dicts with the keys ``url``, ``pdf_url``, ``arxiv_id``,
        ``file_contents`` and ``arxiv_obj``.
    """
    notes = []
    for _, row in df.iterrows():
        url = row.url
        file = row.File
        arxiv_id = row.id
        with open(f"{NOTES_DIR}{file}") as file_:
            file_contents = file_.read()
        print(file, arxiv_id)
        # First hit of the id lookup; raises IndexError when nothing is found.
        arxiv_obj = arxiv.query(id_list=[str(arxiv_id)])[0]
        # Derive the PDF address from the abstract-page address.
        pdf_url = url.replace("abs", "pdf") + ".pdf"
        notes.append(dict(url=url, pdf_url=pdf_url, arxiv_id=arxiv_id,
                          file_contents=file_contents, arxiv_obj=arxiv_obj))
    return notes


def complete_notes(notes):
    """Add the derived fields to every note in place and return the same list.

    From ``arxiv_obj``: ``authors``, ``published`` (first seven characters),
    ``abs`` (summary on one line) and ``title`` (one line, double spaces
    collapsed twice).  From ``file_contents`` split on blank lines: ``notes``
    (all parts but the first and last) and ``tags`` (last part, cleaned,
    split on ", " and capitalised).
    """
    tag_cleanup = str.maketrans({"#": None, "\n": None, "_": " "})
    for entry in notes:
        record = entry["arxiv_obj"]
        entry["authors"] = record['authors']
        entry["published"] = record["published"][:7]
        entry["abs"] = record["summary_detail"]["value"].replace("\n", " ")

        sections = entry["file_contents"].split("\n\n")
        entry["notes"] = sections[1:-1]
        raw_tags = sections[-1].translate(tag_cleanup).split(", ")
        entry["tags"] = [raw_tag.capitalize() for raw_tag in raw_tags]

        one_line_title = record['title'].replace("\n", " ")
        for _ in range(2):
            one_line_title = one_line_title.replace("  ", " ")
        entry["title"] = one_line_title
    return notes
Exemple #29
0
#                 conn_matrix[i,:] = temp
#
#         # fig = plt.figure(figsize=(10,5));
#         # plt.spy(conn_matrix, marker ='s', color='chartreuse', markersize=5);
#         # plt.xlabel('Authors');
#         # plt.ylabel('Articles');
#         # plt.title('Authors of the articles', fontweight='bold');
#
#         return conn_matrix

# Command line: <search query> <maximum number of results>
search_query = str(sys.argv[1])
max_results = int(sys.argv[2])

# Newest submissions first.
results = arxiv.query(search_query=search_query,
                      start=0,
                      max_results=max_results,
                      sort_by="submittedDate",
                      sort_order="descending")

# The helpers below are defined outside this excerpt; judging by their names
# they split the results into per-field values and count/plot them — confirm.
title, authors, date, summary, tags, pdf_url = store_results(results)

cat_counts = categorize_tags(tags)

author_counts = categorize_authors(authors)

cat_histogram(cat_counts, search_query, max_results)

author_histogram(author_counts, search_query, max_results)

unique_authors = get_unique_authors(authors)
Exemple #30
0
import arxiv

# Query arXiv and list title and publication date of every hit.
results = arxiv.query(query="learning analytics education", max_results=1000)
hit_count = len(results)
print("Found ", hit_count, " papers.")
for paper in results:
    paper_title = paper['title']
    published_on = paper['published']
    print("Downloading: ", paper_title, " ", published_on)
    # The download itself is disabled: arxiv.download(paper)
from html import escape as _escape_html

# Arguments come from the CGI environment when run by a web server, otherwise
# from key=value pairs on the command line.
if 'GATEWAY_INTERFACE' in os.environ:
    args = cgi_to_dict()
else:
    # Split on the first '=' only so a value may itself contain '='.
    args = dict(arg.split('=', 1) for arg in sys.argv[1:])

#
## Main
#

start = int(args.get('start', 0))
length = int(args.get('length', 50))
query = args.get('query', QUERY)
res = arxiv.query(query=query,
                  max_results=length,
                  sort_by="submittedDate",
                  sort_order="descending",
                  start=start)

# The query is user-supplied, so it is HTML-escaped before being echoed.
print(f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>ArXiv search results</title>
    <link rel="stylesheet" href="https://arxiv.org/css/arXiv.css">
</head>
<body class="with-cu-identity">
<div id="header">
<h1>arXiv.org – {_escape_html(query)} – start={start}</h1>
</div>""")
#!/usr/bin/env python
import pprint

#import requests
import arxiv
import pandas as pd

# Look up papers by author and print the main fields of the first hit.
papers = arxiv.query(query='au:"Grisha Perelman"')
first_paper = papers[0]

# One block per field: blank line, "<field>:", then the value.
for field in ('author', 'title', 'arxiv_url', 'pdf_url', 'summary'):
    print("\n" + field + ":\n" + first_paper[field])

#response = requests.post("http://localhost:3000/paper/create/")
#print(response.status_code)
#print(response.text)
Exemple #33
0
import arxiv
import pprint

# Query arXiv for 'residual' and print title and primary category of each hit.
results = arxiv.query('residual')

pp = pprint.PrettyPrinter(indent=1)

for item in results:
    title, category = item['title'], item['arxiv_primary_category']['term']
    # Read but not printed.
    summary, published, updated = (
        item['summary'], item['published'], item['updated'])
    print(title, category)
Exemple #34
0
def arxiv_random(message):
    """Reply to *message* with a randomly chosen recent arXiv math paper.

    The identifiers published since the last publication date (US/Eastern)
    are listed through the arXiv OAI interface, one is picked at random and
    its metadata is sent back as an HTML-formatted reply.  Any exception is
    caught and written to the action log.
    """
    user_action_log(message, "made arxiv random query")
    try:
        eastern = pytz.timezone('US/Eastern')
        eastern_time = datetime.datetime.now(eastern)
        # publications on 20:00
        if eastern_time.hour < 20:
            eastern_time -= datetime.timedelta(days=1)
        # no publications on friday and saturday
        if eastern_time.weekday() == 5:
            eastern_time -= datetime.timedelta(days=2)
        elif eastern_time.weekday() == 4:
            eastern_time -= datetime.timedelta(days=1)
        last_published_date = eastern_time.strftime("%Y-%m-%d")
        response = requests.get('http://export.arxiv.org/oai2',
                                params={'verb'          : 'ListIdentifiers',
                                        'set'           : 'math',
                                        'metadataPrefix': 'oai_dc',
                                        'from'          : last_published_date})
        action_log("Random arxiv paper since {}".format(last_published_date))
        # everything went fine
        if response.status_code == 200:
            response_tree = ElementTree.fromstring(response.content)
            identifiers = response_tree[2]
            num_of_papers = len(identifiers)
            if num_of_papers == 0:
                user_action_log(message,
                                "arxiv random query failed: "
                                "no papers since {}".format(last_published_date))
                return
            # randrange excludes the upper bound; randint(0, n) could return n
            # and index one past the last identifier.
            paper_index = random.randrange(num_of_papers)
            paper_arxiv_id = identifiers[paper_index][0].text.split(':')[-1]  # hardcoded
            paper_obj = arxiv.query(id_list=[paper_arxiv_id])[0]
            paper_link = paper_obj['pdf_url'].replace('http://', 'https://') + '.pdf'
            paper_link_name = paper_link.split("/pdf/")[1]
            print(paper_link)
            print(paper_link_name)
            # A HEAD request is enough to learn the size of the PDF.
            req_pdf_size = requests.head(paper_link)
            pdf_size = round(int(req_pdf_size.headers["Content-Length"]) / 1024 / 1024, 2)

            a_name = paper_obj['authors'][0]
            if len(paper_obj['authors']) > 1:
                # Leading space keeps the name and "et al." apart.
                a_name += ' et al.'

            query_answer = '{}. <a href="{}">{}</a>. {}\n\n— <a href="{}">{}</a>, {} Мб\n'.format(
                    a_name,
                    paper_obj['arxiv_url'],
                    escape(paper_obj['title'].replace('\n', ' ')),
                    escape(paper_obj['summary'].replace('\n', ' ')),
                    paper_link,
                    paper_link_name,
                    pdf_size
            )
            my_bot.reply_to(message, query_answer, parse_mode="HTML", disable_web_page_preview=False)
            user_action_log(message,
                            "arxiv random query was successful: "
                            "got paper {}".format(paper_obj['arxiv_url']))
            # TODO(randl): doesn't send. Download and delete?
            # my_bot.send_document(message.chat.id, data=paper_link)
        elif response.status_code == 503:
            # we are querying too often
            action_log("Too much queries. 10 minutes break should be enough")
            arxiv_checker.last_call = datetime.datetime.utcnow() - datetime.timedelta(seconds=610)
        else:
            # everything went wrong
            user_action_log(message, "arxiv random query failed: response {}".format(response.status_code))

    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception: {}: {}\nat {} line {}".format(exc_type, ex, fname, exc_tb.tb_lineno))
import arxiv
from py2neo import Graph

# MongoDB: collection "Papers" in database "arxiv" on host "mongodb".
client = MongoClient('mongodb', 27017)
db = client['arxiv']
Papers = db["Papers"]

# Start from an empty collection on every run (destructive).
Papers.drop()

# Neo4j connection; the graph is wiped as well before loading.
uri = "http://neo4j:7474"
password = "******"
graph = Graph(uri, password=password)
graph.run("MATCH (n) DETACH DELETE n")

# Base insertion: store every paper returned for the query "quantum".
for paper in arxiv.query(query="quantum", max_results=1000):
    Papers.insert_one(paper)

# Cursor over all stored papers, consumed by the loop that follows.
cursor_paper = Papers.find()

for paper in cursor_paper:
    create_paper = "CREATE (p:PAPER {id: '%s' })" % paper["id"]
    match_authors = ""
    link_authors = ""
    for i, author in enumerate(paper["authors"]):
        match_authors += "MERGE (u%s:AUTHOR {name:\"%s\"}) \n" % (i, author)
        link_authors += "MERGE (u%s)-[:AUTHORED {author_rank: %s}]->(p) \n" % (
            i, i)
    match_tags = ""
    link_tags = ""
    for i, tag in enumerate(paper["tags"]):