Example #1
    def download_info(self):
        # Fetch the 30 most recent cond-mat.quant-gas submissions
        self.articles = arxivpy.query(search_query=['cond-mat.quant-gas'],
                                      start_index=0,
                                      max_index=30,
                                      sort_by='submittedDate')

        # Download the list of interesting authors (a single CSV row) from Dropbox
        p_ = 'interesting_authors.csv'
        url = "https://www.dropbox.com/s/yismcsi2ti35qse/interesting_authors.csv?dl=1"
        u = urllib.request.urlopen(url)
        data = u.read()
        u.close()
        with open(p_, "wb") as f:
            f.write(data)
        with open(p_, 'r') as my_file:
            reader = csv.reader(my_file, delimiter=',')
            self.interesting_authors = list(reader)[0]

        # Download the list of interesting title keywords the same way
        p_ = 'interesting_keywords.csv'
        url = "https://www.dropbox.com/s/u9pqzmomoa0jgmm/interesting_keywords.csv?dl=1"
        u = urllib.request.urlopen(url)
        data = u.read()
        u.close()
        with open(p_, "wb") as f:
            f.write(data)
        with open(p_, 'r') as my_file:
            reader = csv.reader(my_file, delimiter=',')
            self.interesting_title_keywords = list(reader)[0]
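As a hedged follow-up (not part of the original example), a minimal sketch of how the downloaded lists might be used to filter the fetched articles; filter_articles is a hypothetical helper, and the 'main_author' and 'title' keys are the same fields the later examples read from arxivpy results.

def filter_articles(articles, interesting_authors, interesting_title_keywords):
    # Hypothetical helper: keep articles whose main author appears in the
    # downloaded author list or whose title contains one of the keywords.
    matches = []
    for article in articles:
        by_author = article['main_author'] in interesting_authors
        by_keyword = any(kw.strip().lower() in article['title'].lower()
                         for kw in interesting_title_keywords)
        if by_author or by_keyword:
            matches.append(article)
    return matches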
Example #2
def scrape_arxiv():
    articles = arxivpy.query(
        search_query=['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'],
        start_index=0,
        max_index=200,
        results_per_iteration=100,
        wait_time=5.0,
        sort_by='lastUpdatedDate')  # grab 200 articles
    arxivpy.download(articles, path='arxiv_pdf')
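A short usage sketch for the function above, assuming arxivpy is already imported in the module and the working directory is writable; it runs the crawler and lists whatever PDFs arxivpy.download placed in 'arxiv_pdf'.

import os

if __name__ == '__main__':
    scrape_arxiv()
    # list whatever files ended up in the download folder
    for name in sorted(os.listdir('arxiv_pdf')):
        print(name)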
Example #3
def crawl_machine_learning(start_index: int, sort_order: str):
    conn = connect_database()

    machine_learning_categories = [
        'cs.CV', 'cs.CL', 'cs.LG', 'cs.AI', 'cs.NE', 'stat.ML'
    ]

    STEP = 100
    articles_per_minute = STEP * 2

    article_len = articles_per_minute

    if start_index == -1:
        start_index = Article.get_n_articles(conn) - STEP

    logging.info('crawling start')
    logging.info('start index : ' + str(start_index))
    logging.info('sort_order : ' + sort_order)

    while article_len == articles_per_minute:
        # query 100 results per iteration
        # wait 30 seconds per query
        try:
            articles = arxivpy.query(search_query=machine_learning_categories,
                                     start_index=start_index,
                                     max_index=start_index +
                                     articles_per_minute,
                                     results_per_iteration=STEP,
                                     wait_time=30,
                                     sort_by='lastUpdatedDate',
                                     sort_order=sort_order)

            # crawling log
            logging.info('last: ' + str(articles[-1]['publish_date']))
            logging.info(str(start_index + STEP * 2) + ' articles crawled')

            # save articles
            for article in articles:
                Article(article, conn).save()

            # compute start_index
            start_index += STEP * 2

            # compute article_len
            article_len = len(articles)

            # sleep 5 minutes
            time.sleep(MINUTE * 5)
        except Exception as e:
            logging.error(e)
            time.sleep(60 * 30)

    conn.close()
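The function above depends on a module-level MINUTE constant and on the project's connect_database and Article helpers, none of which are shown. A minimal sketch of the assumed scaffolding and an invocation (the constant value and the arguments are guesses, not the original project's code):

import logging
import time

import arxivpy

MINUTE = 60  # assumed value of the sleep constant used above

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # start_index=-1 resumes near the number of articles already stored;
    # 'descending' mirrors the arXiv API's sortOrder values.
    crawl_machine_learning(start_index=-1, sort_order='descending')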
Example #4
def fetch_recent_cv_papers(filename, num=65536):
    papers = arxivpy.query(search_query=['cs.CV'],
                           start_index=0,
                           max_index=num - 1,
                           results_per_iteration=128,
                           wait_time=2.0,
                           sort_by='submittedDate')

    # Normalise articles
    for paper in papers:
        # Dates as strings
        paper['publish_date'] = paper['publish_date'].isoformat()
        paper['update_date'] = paper['update_date'].isoformat()

    with open(filename, 'w') as f:
        f.write(json.dumps(papers))
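For completeness, a sketch of reading the dump back; because the dates were serialised with isoformat(), datetime.fromisoformat (Python 3.7+) restores them. load_cv_papers is a hypothetical counterpart, not part of the original.

import json
from datetime import datetime

def load_cv_papers(filename):
    # Inverse of fetch_recent_cv_papers: load the JSON dump and
    # turn the ISO date strings back into datetime objects.
    with open(filename) as f:
        papers = json.load(f)
    for paper in papers:
        paper['publish_date'] = datetime.fromisoformat(paper['publish_date'])
        paper['update_date'] = datetime.fromisoformat(paper['update_date'])
    return papers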
Example #5
import urllib.request, json
from phraseg import *
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib as mpl
import csv
import arxivpy

mpl.rcParams['figure.dpi'] = 300

articles = arxivpy.query(search_query=['cs.CL'],
                         start_index=0,
                         max_index=500,
                         results_per_iteration=100,
                         wait_time=1.0,
                         sort_by='lastUpdatedDate')

datas = ""
for a in articles:
    datas += a['title'] + "\n"
    datas += a['abstract'] + "\n"
print("Finish fetching")

phraseg = Phraseg(datas, idf_chunk=300)
result = phraseg.extract(result_word_minlen=1, merge_overlap=True)

wordcloud = WordCloud(font_path='wordcloud/NotoSansCJKtc-Medium.otf',
                      width=1800,
                      height=1000,
                      margin=1,
                      background_color="white").fit_words(result)
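The script builds the word cloud but stops before rendering it; a plausible continuation (an assumption, not shown in the original) saves and displays the image:

plt.figure(figsize=(18, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('cs_cl_wordcloud.png', bbox_inches='tight')  # hypothetical output name
plt.show()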
Example #6
def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper not exists in semantic scholar, arvixID : %s" %
                    arvixID)
                continue
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    logging.debug("Error in parsing meta data")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)
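crawl_category reads start_index and end_index as module-level globals and relies on the project's Paper model and the get_arvixpaper_semantic_scholar helper, which are not shown. A hedged sketch of a driver loop (the index bounds and category list are assumptions):

start_index = 0        # assumed module-level globals read by crawl_category
end_index = 10000

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for category in ['cs.LG', 'cs.CV', 'cs.CL', 'stat.ML']:
        crawl_category(term=category)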
Example #7
    if args.random:
        args.start = int(np.random.uniform(low=0, high=10000))
        args.number = int(np.random.uniform(low=1, high=3))
        print("random mode, start: {}, number: {}".format(
            args.start, args.number))

    if args.field == 'cv':
        search_query = ['cs.CV']
    else:
        search_query = args.field

    print('Searching for {}'.format(search_query))

    articles = arxivpy.query(search_query=search_query,
                             start_index=args.start,
                             max_index=args.start + args.number,
                             results_per_iteration=100,
                             wait_time=5.0,
                             sort_by='lastUpdatedDate')  # grab args.number articles starting at args.start

    print("Available Keys: ", articles[0].keys())
    # print(articles[1])

    paperlist_file = open("paperlist.txt", "w")

    items = []
    for idx, article in enumerate(articles):
        items.append(
            "============================================ Paper {} ===========================================\n"
            .format(idx + 1))
        items.append("Title: \n    {}\n".format(article['title']))
        items.append("Author: \n    {}\n".format(article['authors']))
Example #8
def crawl_machine_learning(start_index: int, sort_order: str):
    # conn = connect_database()

    DBU = RDS_utils()

    check_axv = arxiv_id_check(DBU)

    machine_learning_categories = [
        'cs.CV', 'cs.CL', 'cs.LG', 'cs.AI', 'cs.NE', 'stat.ML', 'cs.MA'
    ]

    STEP = 100
    articles_per_minute = STEP * 2

    article_len = articles_per_minute

    # if start_index == -1:
    #     start_index = Article.get_n_articles(conn) - STEP

    logging.info('crawling start')
    logging.info('start index : ' + str(start_index))
    logging.info('sort_order : ' + sort_order)

    update_paper, insert_paper, insertfail, updatafail = 0, 0, 0, 0
    while article_len == articles_per_minute:
        # query 100 results per iteration
        # wait 5 seconds per query
        try:
            start = time.time()
            articles = arxivpy.query(search_query=machine_learning_categories,
                                     start_index=start_index,
                                     max_index=start_index +
                                     articles_per_minute,
                                     results_per_iteration=STEP,
                                     wait_time=5,
                                     sort_by='lastUpdatedDate',
                                     sort_order=sort_order)
            # crawling log
            # logging.info('last: ' + articles[-1].get('published', ''))
            logging.info(str(start_index + STEP * 2) + ' articles crawled')

            # save articles
            for article in articles:
                data = Article(article, None).tolist()

                print("'{}' cralwed / arxiv_id : {}".format(data[1], data[0]))
                axvid, pubyear = data[0], data[6]
                data[1], qt = get_qtitle(data[1])  # title
                if pubyear:
                    qt = qt.strip() + str(pubyear)
                pid = None
                if isinstance(check_axv, dict):
                    if check_axv.get(axvid):
                        pid = check_axv[axvid]
                else:
                    pid = DBU.get_pid_from_arXiv_id(axvid)
                if not pid:
                    pid = Duplication_check.check(qt)

                if pid:
                    ori = DBU.get_paper_by_p_id(pid)
                    data = Update_aXv_paper(ori, data)
                    if DBU.update_axv(pid, data):
                        update_paper += 1
                    else:
                        updatafail += 1
                else:
                    if DBU.insert_axv(data):
                        pid = DBU.get_pid_from_arXiv_id(axvid)
                        Duplication_check.insert_title_year(qt, pid)
                        insert_paper += 1
                    else:
                        insertfail += 1

            # compute start_index
            start_index += STEP * 2

            # compute article_len
            article_len = len(articles)
            e = int(time.time() - start)
            print('took {:02d}:{:02d}:{:02d} to crawl {} paper'.format(
                e // 3600, (e % 3600 // 60), e % 60, article_len))

            # sleep 30 seconds, not 1 minute
            time.sleep(MINUTE / 2)
        except Exception as e:
            logging.error(e)
            print("insert fail : {}, update fail : {}".format(
                insertfail, updatafail))
            DBU.DB.conn.close()
            return start_index, insert_paper, update_paper

    print("insert fail : {}, update fail : {}".format(insertfail, updatafail))
    DBU.DB.conn.close()
    return start_index, insert_paper, update_paper
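Like Example #3, this variant relies on a MINUTE constant plus project-specific helpers (RDS_utils, Article, Duplication_check, get_qtitle, Update_aXv_paper, arxiv_id_check) that are not shown. A hedged sketch of using the returned tuple to resume crawling; everything beyond the names appearing in the example is an assumption.

MINUTE = 60  # assumed sleep constant used above

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    next_start, inserted, updated = crawl_machine_learning(
        start_index=0, sort_order='descending')
    print('stopped at index {}, inserted {} papers, updated {}'.format(
        next_start, inserted, updated))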