Example #1
def main():
    papers = Paper.select().where(Paper.year == 2015)
    for paper in tqdm(papers):
        try:
            download_extract(paper)
        except KeyboardInterrupt:
            # Allow a clean abort mid-download.
            sys.exit()
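These snippets all assume a peewee Paper model bound to a database, plus sys and tqdm imports. A minimal sketch of that assumed setup (the field subset here is hypothetical; the real model defines many more columns):

import sys

from peewee import SqliteDatabase, Model, CharField, IntegerField
from tqdm import tqdm

db = SqliteDatabase('papers.db')  # assumed backend; could be MySQL/Postgres


class Paper(Model):
    title = CharField()
    year = IntegerField()

    class Meta:
        database = db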
Example #2
def query_by_time():
    now = datetime.now()
    # Midnight three days ago; despite the name, this is a cutoff datetime.
    n_days = datetime(now.year, now.month, now.day) - timedelta(days=3)
    papers = Paper.select().where(Paper.post_time > n_days).order_by(
        Paper.post_time.desc()).execute()
    data = list(map(model_to_dict, papers))
    result = {'code': 0, 'data': data}
    return jsonify(result)
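Here model_to_dict comes from peewee's playhouse.shortcuts, and jsonify implies a Flask app. A sketch of how this view might be wired up (the route path and app object are assumptions, not shown in the snippet):

from flask import Flask, jsonify
from playhouse.shortcuts import model_to_dict

app = Flask(__name__)

# Hypothetical registration; the original URL rule is not shown.
app.add_url_rule('/papers/by_time', 'query_by_time', query_by_time)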
Example #3
def crawl_author():
    papers = Paper.select().execute()
    for paper in tqdm(papers):
        for author_id in paper.authorID:
            if author_id == -1:
                continue
            query = Author.select().where(Author.authorID == author_id)
            if query.exists():
                continue
            success, author_profile = get_author_data(author_id)
            if success is False:
                logging.info("Author %d does not exist!" % author_id)
                continue
            estCitation = author_profile['statistics'][
                'estCitationAcceleration']['estimate']

            influencedIDList = author_profile['statistics']['influence'][
                'influenced']
            influencedIDName = [
                inf['author']['ids'][0] for inf in influencedIDList
            ]
            influenceCount = len(influencedIDName)

            influencedByIDList = author_profile['statistics']['influence'][
                'influencedBy']
            influencedByIDName = [
                inf['author']['ids'][0] for inf in influencedByIDList
            ]
            influenceByCount = len(influencedByIDName)

            author = Author.create(
                authorID=author_id,
                name=author_profile['name'],
                semanticScholarUrl=author_profile['url'],
                hIndex=author_profile['statistics']['hIndex'],
                influentialCitationCount=author_profile[
                    'influentialCitationCount'],
                citationVelocity=author_profile['citationVelocity'],
                totalInfluentialCitationCount=author_profile['statistics']
                ['totalInfluentialCitationCount'],
                maxEstCitationAcceleration=estCitation['max'],
                minEstCitationAcceleration=estCitation['min'],
                estCitationAcceleration=estCitation['value'],
                estCitationAccelerationConfidence=estCitation['confidence'],
                influencedIDList=influencedIDName,
                influencedPaper=influencedIDList,
                influenceCount=influenceCount,
                influencedByIDList=influencedByIDName,
                influencedByPaper=influencedByIDList,
                influenceByCount=influenceByCount,
                citationHistory=author_profile['statistics']
                ['citedByYearHistogram'],
                totalPaper=len(author_profile['papers']),
            )
            # Author.create() already inserts the row, so save() is redundant.
            time.sleep(0.3)
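The get_author_data helper is not shown. A plausible shape for it, assuming it wraps an HTTP call to Semantic Scholar and returns a (success, profile) pair; the endpoint and payload fields are assumptions:

import requests


def get_author_data(author_id):
    # Hypothetical reconstruction: the real helper may hit a different
    # endpoint and return a richer 'statistics' payload than this one.
    url = 'https://api.semanticscholar.org/v1/author/%d' % author_id
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        return False, None
    return True, resp.json()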
Example #4
def query_by_hot():
    now = datetime.now()
    n_days = datetime(now.year, now.month, now.day) - timedelta(days=3)
    papers = Paper.select().where(Paper.post_time > n_days).order_by(
        Paper.read_num.desc()).execute()
    data = list(map(model_to_dict, papers))
    remove_keys = ['author', 'content', 'wx_name', 'add_time']
    for item in data:
        for key in remove_keys:
            item.pop(key, None)  # drop fields the hot list does not need
    result = {'code': 0, 'data': data}
    return jsonify(result)
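model_to_dict can also skip columns itself via its exclude argument, which avoids the pop loop; a sketch of the same view in that style (the function name is hypothetical, and the listed fields are assumed to exist on Paper):

def query_by_hot_compact():
    now = datetime.now()
    cutoff = datetime(now.year, now.month, now.day) - timedelta(days=3)
    papers = (Paper.select()
              .where(Paper.post_time > cutoff)
              .order_by(Paper.read_num.desc()))
    exclude = [Paper.author, Paper.content, Paper.wx_name, Paper.add_time]
    data = [model_to_dict(p, exclude=exclude) for p in papers]
    return jsonify({'code': 0, 'data': data})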
Example #5
def admissions():
    all_years = YearDW.select()

    for y in all_years:
        all_congresses = Congress.select(Congress.idCongress) \
            .where(Congress.submissionDeadline.year == y.congressyear)
        for c in all_congresses:
            rel_congress_paper = Congress_Paper.select(Congress_Paper.idPaper) \
                .where(Congress_Paper.idCongress == c.idCongress)
            rel_congress = [r.idPaper for r in rel_congress_paper]
            autors_papers = [
                a.idParticipant for a in Autor.select(
                    Autor.idParticipant).where(Autor.idPaper << rel_congress)
            ]
            all_autors = AutorDW.select(
                AutorDW.idautor).where(AutorDW.idautor << autors_papers)
            for l_autor in all_autors:
                rel_autor = [
                    a.idPaper for a in Autor.select().where(
                        Autor.idParticipant == l_autor.idautor,
                        Autor.idPaper << rel_congress)
                ]
                n_refused = Paper.select() \
                    .where(Paper.accepted == False,
                           Paper.paperId << rel_autor,
                           Paper.paperId << rel_congress) \
                    .count()
                n_accepted = Paper.select() \
                    .where(Paper.accepted,
                           Paper.paperId << rel_autor,
                           Paper.paperId << rel_congress) \
                    .count()

                if (n_refused + n_accepted) > 0:
                    AdmissionsDW.get_or_create(idadmcongress=c.idCongress,
                                               idadmautor=l_autor.idautor,
                                               accepted=n_accepted,
                                               refused=n_refused,
                                               idadmyear=y.idyear)
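The << operator used throughout admissions() is peewee shorthand for an SQL IN clause; both spellings below compile to the same WHERE clause:

# These two queries are equivalent:
Paper.select().where(Paper.paperId << rel_autor)
Paper.select().where(Paper.paperId.in_(rel_autor))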
Example #6
def update_paper():
    all_papers = Paper.select()
    counter = 0
    for p in all_papers:
        counter += 1
        reviews = Review.select().where(Review.idPaper == p.paperId)
        num_reviews = len(reviews)
        if num_reviews == 0:
            # Guard against division by zero: no reviews means no acceptance.
            p.accepted = False
            p.save()
            continue
        avg_score = int(sum(r.score for r in reviews) / num_reviews)
        p.finalScore = avg_score
        if num_reviews < 3 or avg_score < 7:
            p.accepted = False
        p.save()

        if (counter % 10000) == 0:
            print(f"{counter} papers updated")
Example #7
def run():
    gs = GSData()
    db.connect()

    with open('data.txt', encoding='utf-8') as f:
        for line in f:
            wx_name = line.split(' ')[0]
            try:
                data = gs.query(wx_name)
            except GSException as ex:
                print(str(ex))
                continue
            else:
                for item in data:
                    # Normalize the scheme to https before hashing for dedup.
                    url = 'https://{0}'.format(item['url'].split('://', 1)[1])
                    md5s = hashlib.md5(url.encode('utf-8')).hexdigest()
                    if Paper.select().where(Paper.url_hash == md5s).count():
                        continue
                    print(item)
                    p = Paper.create(
                        wx_name=item['wx_name'],
                        name=item['name'],
                        title=item['title'],
                        author=item['author'],
                        content=item['content'],
                        url=url,
                        url_hash=md5s,
                        post_time=datetime.strptime(item['posttime'],
                                                    '%Y-%m-%d %H:%M:%S'),
                        add_time=datetime.strptime(item['add_time'],
                                                   '%Y-%m-%d %H:%M:%S'))
                    if isinstance(item['readnum_newest'], int):
                        p.read_num = item['readnum_newest']
                    if isinstance(item['likenum_newest'], int):
                        p.like_num = item['likenum_newest']
                    if item['picurl']:
                        p.pic_url = item['picurl']

                    p.save()
            sleep(3)

    db.close()
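The select-then-create dedup in run() takes two queries per article; peewee's get_or_create collapses it into one call. A sketch, with a hypothetical helper name and an abbreviated defaults dict:

def store_article(item, url, md5s):
    # get_or_create looks up by url_hash and inserts with defaults on a miss;
    # created is False when the article was already crawled.
    p, created = Paper.get_or_create(
        url_hash=md5s,
        defaults={'wx_name': item['wx_name'],
                  'title': item['title'],
                  'url': url})
    return p, created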
Example #8
def update_paper():
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
        if article is None or idx < 346728:
            # Skip unparsable files and fast-forward past an earlier crawl.
            idx += 1
            continue
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if success is False:
            logging.debug(
                "Paper does not exist in Semantic Scholar, arXiv ID: %s" %
                arvixID)
            continue
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta[
                    'influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            try:
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                page_prefix = segment.split(meta)[0]
                                if meta == 'page':
                                    paper.pages = int(page_prefix.strip())
                                elif meta == 'figure':
                                    paper.figures = int(page_prefix.strip())
                                elif meta == 'table':
                                    paper.table = int(page_prefix.strip())
                                break
            except Exception:
                logging.debug("Error parsing comment metadata")
            paper.save()
        except BaseException as e:
            logging.warning("Error in arvix id %s, error: %s" %
                            (arvixID, str(e)))
        time.sleep(0.2)
        idx += 1
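The page/figure/table extraction above walks comma-separated comment segments; a regex sketch of the same idea, assuming arXiv comments shaped like '12 pages, 5 figures, 2 tables' (the helper name is hypothetical):

import re


def parse_comment_counts(comment):
    # Returns e.g. {'page': 12, 'figure': 5} for '12 pages, 5 figures'.
    counts = {}
    for meta in ('page', 'figure', 'table'):
        match = re.search(r'(\d+)\s*%ss?\b' % meta, comment or '')
        if match:
            counts[meta] = int(match.group(1))
    return counts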
Example #9
def crawl_category(term='cs.LG'):
    # start_index and end_index are module-level globals set by the caller.
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arXiv ID: %s" %
                    arvixID)
                continue
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    logging.debug("Error parsing comment metadata")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)
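crawl_category reads start_index and end_index as globals; a plausible driver, with the category list and crawl window as assumptions:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    start_index, end_index = 0, 10000  # assumed crawl window
    for category in ['cs.LG', 'cs.CV', 'cs.CL']:
        crawl_category(term=category)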