Example #1
def add_coauthor_relation(begin,end,msg):
    '''
    coauthor times and coauthor relationships
    :return:
    mag_authors0411:
    {coauthor_counts:n}
    {coauthor_list: [{year: 1999, id: 1000000}, {year: 1998, id: 1000001}]}
    '''
    start_time = time()
    print(start_time)
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_authors0411")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)[begin:end]
    for i in cursor:
        author_id = i["_id"]
        coauthor_times = 0
        coauthor_list = []
        papers = i["new_pubs"]
        for paper in papers:
            paper_details = col1.find_one({"_id": paper})
            if paper_details:  # skip papers that are missing from mag_papers0415
                coauthor_times += (len(paper_details["authors"]) - 1)
                for author in paper_details["authors"]:
                    if author["id"] != author_id:
                        coauthor_list.append({"coauthor_id": author["id"], "coauthor_time": paper_details["year"]})
        if len(coauthor_list)>0:
            operation.append(pymongo.UpdateOne({"_id": author_id},
                                                {"$set": {"coauthor_counts": coauthor_times, "coauthor": coauthor_list}}))
    print(msg,"线程已完成",len(operation),flush=True)
    col2.bulk_write(operation, ordered=False)
    cursor.close()
    print(msg,time(), (time() - start_time))
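
All of the examples on this page rely on a connectTable(db, collection) helper that is not shown here. A minimal sketch of what it might look like, assuming a MongoDB instance on the default local port (adjust the URI for your deployment):

import pymongo

def connectTable(db_name, collection_name):
    # Return a handle to db_name.collection_name on the assumed local server.
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    return client[db_name][collection_name]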
Example #2
def author_citation_number(begin, end, msg):
    '''
    this function is appropriate for mag_authors0510 and citation network0515
    :return:
    '''
    colpaper = connectTable("qiuzh", "mag_papers0510")
    col_author = connectTable("qiuzh", "mag_authors0510")

    count = 0
    operation = []
    cursor = col_author.find(no_cursor_timeout=True)[begin:end]
    for author in cursor:
        count += 1
        author_id = author["_id"]
        citation_number = 0
        for paper in author["new_pubs"]:
            p = colpaper.find_one({"_id": paper["pid"]})
            if p:  # skip papers that are missing from mag_papers0510
                citation_number += p["cn"]

        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {
                                  "cn": citation_number
                              }}))

        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
    cursor.close()
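
add_coauthor_relation and author_citation_number both take (begin, end, msg) arguments, which suggests they are meant to be run by several workers, each scanning a disjoint slice of the cursor. A minimal driver sketch under that assumption (the process count and the usage line are illustrative, not part of the original code):

from multiprocessing import Process

def run_in_slices(worker, total_docs, n_procs):
    # Split [0, total_docs) into n_procs contiguous slices and run one worker per slice.
    step = total_docs // n_procs + 1
    procs = []
    for k in range(n_procs):
        begin, end = k * step, min((k + 1) * step, total_docs)
        p = Process(target=worker, args=(begin, end, "worker-%d" % k))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()

# e.g. run_in_slices(author_citation_number, connectTable("qiuzh", "mag_authors0510").count_documents({}), 8)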
Example #3
def clone_collection():
    coll = connectTable("oga_one", "mag_paper_plus2")
    # col2 = connectTable("qiuzh","MAG_authors")
    col3 = connectTable("qiuzh", "papers")
    for i in coll.find({"$and": [{"venue": {"$exists": True}}]}):
        col3.insert_one(i)
    print(col3.find().count())
Example #4
def divide_researchers_into_2groups():
    col_author = connectTable("qiuzh", "mag_researchers0810")
    col1 = connectTable("qiuzh", "researchers0810_trainingset")
    col2 = connectTable('qiuzh', "researchers0810_testset")
    opt1 = []
    opt2 = []
    count = 0
    cursor = col_author.find(no_cursor_timeout=True)
    for researcher in cursor:
        count += 1
        if researcher["first_year"] <= 1996:
            opt1.append(pymongo.InsertOne(researcher))
        else:
            opt2.append(pymongo.InsertOne(researcher))

        if count % 10000 == 0:
            print("processed (x10k):", count / 10000, flush=True)
            if opt1:
                col1.bulk_write(opt1, ordered=False)
                print("written to training set:", len(opt1), flush=True)
            if opt2:
                col2.bulk_write(opt2, ordered=False)
                print("written to test set:", len(opt2), flush=True)
            opt1 = []
            opt2 = []
    if opt1:
        col1.bulk_write(opt1, ordered=False)
        print("wrote remaining to training set:", len(opt1), flush=True)
    if opt2:
        col2.bulk_write(opt2, ordered=False)
        print("wrote remaining to test set:", len(opt2), flush=True)
    cursor.close()
def match_v1_v2_id(begin, end, msg):
    coll = connectTable("qiuzh", "mag_papers")
    coll3 = connectTable('qiuzh', "mag_authors0409")
    opt = []
    count = 0
    cursor = coll3.find(no_cursor_timeout=True)[begin:end]
    for i in cursor:
        if count % 100000 == 0:
            print("thread %s: completed %s x100k records" % (msg, count / 100000), flush=True)
        count += 1
        v2author_id = i.get("_id")
        # print(v2author_id)
        new_pubs = []
        papers = coll.find({"authors.id": v2author_id})
        for paper in papers:
            new_pubs.append(paper.get("_id"))
        opt.append(
            pymongo.UpdateOne({"_id": v2author_id},
                              {"$set": {
                                  "new_pubs": new_pubs
                              }}))

    cursor.close()
    coll3.bulk_write(opt, ordered=False)
    print("线程: %s, 遍历了 %s" % (msg, len(opt)))
def researchers_con():
    '''
    copy the coauthor count ("con") from mag_authors0510 into mag_researchers0707
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_authors0510")
    col2 = connectTable('qiuzh', "mag_researchers0707")
    count = 0
    operation = []
    cursor = col2.find(no_cursor_timeout=True)
    for author in cursor:
        count += 1
        author_id = author["_id"]
        coauthor_number = col1.find_one({"_id": author_id})["con"]

        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {
                                  "con": coauthor_number
                              }}))

        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col2.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
    if operation:
        col2.bulk_write(operation, ordered=False)
        print("又处理", len(operation))
    cursor.close()
Example #7
def paper_citation_number(begin, end, msg):
    '''
    this function is appropriate for citation_network0515 and mag_papers0510
    :return: add each papers' total citation in mag_papers0510
    '''
    colpaper = connectTable("qiuzh", "mag_papers0510")
    col_citation_network = connectTable("qiuzh",
                                        "citation_network0810_trainingset")

    count = 0
    operation = []
    cursor = colpaper.find(no_cursor_timeout=True)[begin:end]
    for paper in cursor:
        count += 1
        paper_id = paper["_id"]
        citation_number = 0
        paper_citation_relations = col_citation_network.find(
            {"id": paper_id}, no_cursor_timeout=True)
        # a Cursor object is always truthy, so iterate over it directly
        for paper_citation_relation in paper_citation_relations:
            citation_number += len(paper_citation_relation["citation"])
        operation.append(
            pymongo.UpdateOne({"_id": paper_id},
                              {"$set": {
                                  "cn_before1996": citation_number
                              }}))
        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            colpaper.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        colpaper.bulk_write(operation, ordered=False)
    cursor.close()
Example #8
def clone_author_collection():
    coll = connectTable("academic", "mag_authors")
    col2 = connectTable("qiuzh", "mag_authors0409")
    for i in coll.find({"id": {"$exists": True}}):
        if "pubs" in i.keys():
            new_document = {}
            new_document["_id"] = i["id"]
            new_document["pubs"] = i["pubs"]
            col2.insert_one(new_document)
    print(col2.count_documents({}))
def filter_papers_by_JCR():
    '''
    Filter papers from col1 into col2: a paper is kept only if its journal appears in JCR, i.e. it has a "field" attribute.
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_papers0510")
    cursor = col1.find({"field": {"$exists": True}}, no_cursor_timeout=True)
    for i in cursor:
        col2.insert_one(i)
def filter_authors_by_papers():
    '''
    save authors who have new pubs in mag_authors0411
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_authors0409")
    col2 = connectTable('qiuzh', "mag_authors0411")

    for i in col1.find():
        if i["new_pubs"]:
            col2.insert_one(i)
Example #11
def clone_paper_collection():
    coll = connectTable("oga_one", "mag_paper")
    col2 = connectTable("qiuzh", "mag_papers")
    for i in coll.find({"id": {"$exists": True}}):
        if "new_authors" in i.keys() and "year" in i.keys() and "references" in i.keys():
            new_document = {}
            new_document["_id"] = i["id"]
            new_document["authors"] = i["new_authors"]
            new_document["venue"] = i["new_venue"]
            new_document["year"] = i["year"]
            new_document["references"] = i["references"]
            col2.insert_one(new_document)
    print(col2.count_documents({}))
def researchers_collaboration_network():
    '''
    there are some problems with the researchers_con_innewcollection approach, so we use another method to replace it,
    i.e. build the collaboration network first.
    :return:
    '''

    start_time = time()
    print(start_time, flush=True)
    col1 = connectTable("qiuzh", "mag_papers0510")
    col2 = connectTable("qiuzh", "mag_researchers0707")
    col3 = connectTable("qiuzh", "coauthor_network0722")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)
    count = 0
    for i in cursor:
        count += 1
        author_id = i["_id"]
        # coauthor_times = 0
        # coauthor_list = []
        papers = i["new_pubs"]
        for paper in papers:
            paper_details = col1.find_one({"_id": paper},
                                          no_cursor_timeout=True)
            for author in paper_details["authors"]:
                if author["id"] != author_id and col2.find_one(
                    {"_id": author["id"]}, no_cursor_timeout=True):
                    # coauthor_list.append({"coauthor_id": author["id"], "coauthor_time": paper_details["year"]})
                    operation.append(
                        pymongo.InsertOne({
                            "author_id":
                            author_id,
                            "coauthor_id":
                            author["id"],
                            "coauthor_time":
                            paper_details["year"],
                        }))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col3.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
    print("已完成", len(operation), flush=True)
    print(time(), (time() - start_time), flush=True)
Example #13
def add_coauthor_relation2newcollection():
    '''
    coauthor times and coauthor relationships
    :return:
    mag_authors0411:
    {coauthor_counts:n}
    {coauthor_list: [{year: 1999, id: 1000000}, {year: 1998, id: 1000001}]}
    Because some of the authors in the dataset have too many collaborations and would exceed the maximum size of a
    single document, we store the relations in a new collection:
    _id:
    "author_id" :
    "coauthor_id":
    "coauthor_time":
    '''

    start_time = time()
    print(start_time,flush=True)
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_authors0411")
    col3 = connectTable("qiuzh", "coauthor_network0420")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)[3790001:]
    count =0
    for i in cursor:
        count+=1
        author_id = i["_id"]
        # coauthor_times = 0
        # coauthor_list = []
        papers = i["new_pubs"]
        for paper in papers:
            paper_details = col1.find_one({"_id": paper})
            # if paper_details:
            # coauthor_times += (len(paper_details["authors"]) - 1)
            for author in paper_details["authors"]:
                if author["id"] != author_id:
                    # coauthor_list.append({"coauthor_id": author["id"], "coauthor_time": paper_details["year"]})
                    operation.append(pymongo.InsertOne(
                        {"author_id": author_id, "coauthor_id": author["id"], "coauthor_time": paper_details["year"],
                         }))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col3.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
    print("已完成",len(operation),flush=True)
    print(time(), (time() - start_time), flush=True)
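
Once the pairwise relations live in their own collection, per-author lookups (e.g. counting an author's collaborations) filter on author_id, so an index on that field is worth creating up front. A one-line sketch with the collection name from the example above:

connectTable("qiuzh", "coauthor_network0420").create_index("author_id")  # speeds up count_documents({"author_id": ...})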
def filter_researchers_paper_by_authors():
    '''
    from mag_researchers0707 (pubs >= 10, academic career length >= 10) to mag_researchers0810 (a paper is kept only
    if it has at most 10 authors)
    :return:
    this function was created on 2021.8.10
    '''
    # col2 = connectTable('qiuzh', "mag_researchers0707")
    # col2.drop()
    col1 = connectTable('qiuzh', "mag_researchers0707")
    col2 = connectTable('qiuzh', "mag_researchers0810")
    col_paper = connectTable("qiuzh", "mag_papers0510")
    cursor = col1.find(no_cursor_timeout=True)
    opt = []
    count = 0
    print(col1.count_documents({}))
    for i in cursor:
        count += 1
        pubs = i["new_pubs"]
        new_pubs = []
        for pub in pubs:
            paper = col_paper.find_one({"_id": pub["pid"]})
            if len(paper["authors"]) <= 10:
                new_pubs.append(pub)
        opt.append(
            pymongo.InsertOne({
                "_id": i["_id"],
                "new_pubs": new_pubs,
                "pub_count": i["pub_count"],
                "first_year": i["first_year"],
                "last_year": i["last_year"],
                "cn": i["cn"]
            }))
        if count % 10000 == 0:
            print(len(opt))
            print(count)
            print("已处理:", count / 10000, flush=True)
            col2.bulk_write(opt, ordered=False)
            print("已写入:", count / 10000, flush=True)
            opt = []
    if opt:
        col2.bulk_write(opt, ordered=False)
        print("最终又完成", len(opt))
    print(count)
    cursor.close()
def filter_author_by_careerlife(begin, end, msg):
    '''
    :param msg:
    :param begin:
    :param end:
    :return: keep authors whose academic career (last_year - first_year) spans at least 20 years
    '''
    col1 = connectTable('qiuzh', "mag_authors0421")
    col2 = connectTable('qiuzh', "mag_authors0411")
    cursor = col2.find(no_cursor_timeout=True)[begin:end]
    opt =[]
    for i in cursor:
        if i["last_year"] - i["first_year"] >= 20:  # career length of at least 20 years
            opt.append(pymongo.InsertOne({"_id": i["_id"], "new_pubs": i["new_pubs"], "pub_count": i["pub_count"],
                                          "first_year": i["first_year"], "last_year": i["last_year"]}))
    if opt:
        col1.bulk_write(opt, ordered=False)
    cursor.close()
def filter_papers_by_new_pubs():
    '''
    :return: delete every paper whose id is not referenced in any author's new_pubs
    '''
    col1 = connectTable("qiuzh","papers")
    col2 = connectTable('qiuzh', "MAG_authors")
    medset =set()
    for i in col2.find():
        for j in i.get("new_pubs"):
            medset.add(j)
    print(len(medset))
    result=col1.delete_many({"id": {"$nin":list(medset)}})
    print(result.deleted_count)  # number of deleted documents
def filter_author_by_citation(begin, end,msg):
    '''
    :param msg: multi-process information
    :param begin: i-th
    :param end: i+1-th
    :return: pubs counts>=5
    '''
    col1 = connectTable("academic", "mag_authors")
    col2 = connectTable('qiuzh', "MAG_authors")
    opt = []
    # count = 0
    for i in col1.find({"n_pubs":{"$gte":5}})[begin: end]:
        a =i
        opt.append(pymongo.InsertOne(i))
    col2.bulk_write(opt, ordered=False)
    print("线程: %s, 遍历了 %s" % (msg, len(opt)))
def filter_author_by_abstract():
    '''
    Some of the papers in the dataset are news items and some of the authors are journal editors, so we need to filter them by abstract
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_papers0415")
    print("okay")
Example #19
def find_critical_year():
    '''
    2020.8.31: the critical year is 1996, with 559808 researchers in total (more than half of the dataset)
    :return:
    '''

    col_author = connectTable("qiuzh", "mag_researchers0810")
    year_list = [
        1802, 1803, 1810, 1814, 1815, 1816, 1819, 1823, 1825, 1827, 1828, 1829,
        1830, 1832, 1833, 1834, 1836, 1838, 1839, 1841, 1842, 1843, 1844, 1845,
        1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857,
        1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869,
        1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881,
        1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893,
        1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905,
        1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917,
        1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929,
        1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941,
        1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
        1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965,
        1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977,
        1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
        1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
        2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
        2014, 2015, 2016, 2017, 2018
    ]
    # year_list = [1957, 1987]
    total = 0
    for year in year_list:
        researcher_number = col_author.count_documents({"first_year": year})
        total += researcher_number
        print(researcher_number, total)
        if total >= 541461:
            print(year)
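
The hard-coded year_list above can be avoided: the same per-year counts are available from a single $group aggregation. A sketch using the same collection and threshold as the example:

col_author = connectTable("qiuzh", "mag_researchers0810")
pipeline = [
    {"$group": {"_id": "$first_year", "count": {"$sum": 1}}},  # researchers per debut year
    {"$sort": {"_id": 1}},                                     # ascending by year
]
cumulative = 0
for row in col_author.aggregate(pipeline):
    cumulative += row["count"]
    if cumulative >= 541461:  # the same half-of-the-dataset threshold used above
        print("critical year:", row["_id"])
        break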
Example #20
def author_pubs_count():
    '''
    print the average, max, min and total pub_count over all authors
    :return:
    '''
    coll = connectTable("qiuzh", "mag_authors0510")
    # coll = connectTable("qiuzh", "test1")
    a = coll.aggregate([{
        "$group": {
            "_id": {},
            "avg": {
                "$avg": "$pub_count"
            },
            "max": {
                "$max": "$pub_count"
            },
            "min": {
                "$min": "$pub_count"
            },
            "sum": {
                "$sum": "$pub_count"
            },
        }
    }])
    for i in a:
        print(i)
def print_bsur_into_pc():
    '''
    On 8.30 we used this function to dump the data to txt files; however, we mistakenly saved the data into Bsur
    rather than Bsur0810.
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_researchers0810")
    DI = []
    KI = []
    SI = []
    for author in col1.find():
        d_i = author["bsur"]
        k_i = author["ifdis"]
        s_i = author["iftop"]
        DI.append(d_i)
        KI.append(k_i)
        SI.append(s_i)

    print("list has loaded")
    data = open("C:/Users/qzh/PycharmProjects/MAG/datafile/Bsur0810.txt", "w+")
    for j in range(len(DI)):
        print(DI[j], file=data)
    data.close()

    data = open("C:/Users/qzh/PycharmProjects/MAG/datafile/Ifdis0810.txt",
                "w+")
    for j in range(len(KI)):
        print(KI[j], file=data)
    data.close()

    data = open("C:/Users/qzh/PycharmProjects/MAG/datafile/Iftop0810.txt",
                "w+")
    for j in range(len(SI)):
        print(SI[j], file=data)
    data.close()
Example #22
def calculate_coauthor_times2():
    '''
    this version is appropriate for mag_authors0510
    this function is not finished
    :return:
    '''
    start_time = time()
    print(start_time)
    col1 = connectTable("qiuzh", "mag_authors0411")
    operation = []
    count=0
    cursor = col1.find(no_cursor_timeout=True)
    for i in cursor:
        count+=1
        author_id = i["_id"]
        coauthor_times = col1.count({"author_id": author_id})
        operation.append(pymongo.UpdateOne({"_id": author_id}, {"$set": {"coauthor_times": coauthor_times}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col1.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col1.bulk_write(operation, ordered=False)
    cursor.close()
    print(time(), (time() - start_time))
Example #23
def boot_strap(P_d):
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        coauthor_times = author["new_con"]
        author_id = author["_id"]
        d_i_list = np.random.binomial(coauthor_times, P_d, 20)
        surprisal_list = []
        for di in d_i_list:
            P0 = stats.binom.sf(di - 1, coauthor_times, P_d)
            surprisal_list.append(-math.log(P0))
        S = np.mean(surprisal_list)
        operation.append(
            pymongo.UpdateOne({"_id": author_id}, {"$set": {
                "bsur": S
            }}))

        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
        print("又写入并完成", len(operation))
    cursor.close()
    print(col_author.count_documents({"sur": -6}))
    print(col_author.count_documents({"dn": -1}))
    print(col_author.count_documents({"bsur": -6}))
Example #24
def find_discoverer(maxbsur):
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        sur = author["sur"]
        author_id = author["_id"]
        # ifdis = 0 when the surprisal falls within [0, maxbsur), otherwise 1
        ifdis = 0 if 0 <= sur < maxbsur else 1
        operation.append(
            pymongo.UpdateOne({"_id": author_id}, {"$set": {
                "ifdis": ifdis
            }}))

        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
        print("又写入并完成", len(operation))
    cursor.close()
Example #25
def initialize_surprisal():
    col_author = connectTable("qiuzh", "researchers0810_trainingset")

    cursor = col_author.find(no_cursor_timeout=True)
    # researcher_number = cursor.count()
    # print(researcher_number)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {
                                  "sur": -6,
                                  "bsur": -6
                              }}))

        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
    if operation:
        col_author.bulk_write(operation, ordered=False)
    print("finished")
    cursor.close()
    print(count)
    print(col_author.find({"dn": -1}, no_cursor_timeout=True).count())
Example #26
def initialize_discover_number():
    '''
    this function was used on 2021.8.12 on mag_researchers0810;
    on 2021.9.1 we used it again on researchers0810_trainingset
    :return:
    '''
    col_author = connectTable("qiuzh", "researchers0810_trainingset")

    cursor = col_author.find(no_cursor_timeout=True)
    # researcher_number = cursor.count()
    # print(researcher_number)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]}, {"$set": {
                "dn": -1
            }}))

        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
    if operation:
        col_author.bulk_write(operation, ordered=False)
    print("finished")
    cursor.close()
    print(count)
    print(col_author.find({"dn": -1}, no_cursor_timeout=True).count())
Example #27
def new_pub_count(begin, end, msg):
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    cursor = col_author.find(no_cursor_timeout=True)[begin:end]
    for author in cursor:
        count += 1
        author_id = author["_id"]
        pub_count = 0
        for paper in author["new_pubs"]:
            if paper["year"] <= 1996:
                pub_count += 1
        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {
                                  "pub_count": pub_count
                              }}))

        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
    cursor.close()
Example #28
def author_first_year_distribution():
    '''
    print the distinct values of first_year across mag_authors0510
    :return:
    '''
    col = connectTable("qiuzh", "mag_authors0510")
    yearlist = col.distinct("first_year")
    print(yearlist)
def delete_coauthor_counts():
    col = connectTable('qiuzh', "mag_authors0411")
    # cursor = col.find({"coauthor_counts":{"$exists": True}},no_cursor_timeout=True)[begin:end]
    # for i in cursor:
    #     _id = i.get("_id")
    #     col.update_one({"_id":_id}, {"$unset": {"new_pubs": 1}},False,True)
    col.update_many({"coauthor_counts": {"$exists": True}}, {"$unset": {"coauthor_counts": 1, "coauthor": 1}})
    # cursor.close()
    print("yes okay")
Example #30
def author_pubs_number():
    mycol = connectTable("qiuzh", "mag_authors0510")
    for i in mycol.find():
        author_id = i["_id"]
        pub_number = len(i["new_pubs"])
        mycol.update_one({"_id": author_id},
                         {"$set": {
                             "pub_count": pub_number
                         }})
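
author_pubs_number issues one update_one per author, which is slow on a large collection. A hedged rewrite sketch using the batched bulk_write pattern seen in the other examples on this page (the batch size is arbitrary; connectTable is the helper sketched near the top):

import pymongo

def author_pubs_number_bulk(batch_size=10000):
    # Same update as author_pubs_number, but flushed to MongoDB in batches.
    mycol = connectTable("qiuzh", "mag_authors0510")
    operation = []
    for i in mycol.find(no_cursor_timeout=True):
        operation.append(
            pymongo.UpdateOne({"_id": i["_id"]},
                              {"$set": {"pub_count": len(i["new_pubs"])}}))
        if len(operation) >= batch_size:
            mycol.bulk_write(operation, ordered=False)
            operation = []
    if operation:
        mycol.bulk_write(operation, ordered=False)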