Ejemplo n.º 1
0
def splitFiles(type):
    """Run the head or tail extraction over the selected papers and flag successes.

    Rows are read from ``resolved_papers`` through the module-level ``db``
    connection. For every row, ``_getHead`` or ``_getTail`` is called with the
    paper id and its page count; when the helper returns a truthy value the
    column named by ``type`` is set to 1 through the module-level cursor
    ``cur``.

    Args:
        type: Either ``'head'`` or ``'tail'``. The name shadows the builtin
            but is kept so existing callers (positional or keyword) still work.

    Raises:
        ValueError: If ``type`` is not ``'head'`` or ``'tail'``. Previously an
            unsupported value left ``res`` unbound (or stale from the previous
            row) inside the loop.
    """
    if type not in ('head', 'tail'):
        raise ValueError("type must be 'head' or 'tail', got %r" % (type,))

    # The tail step additionally requires that the head step already succeeded.
    conditions = ('downloaded = 1 and npages >= 5 and pdf2text = 1 '
                  'and english = 1')
    if type != 'head':
        conditions += ' and head = 1'
    # NOTE(review): the query is restricted to id 1702, which looks like a
    # debugging leftover -- confirm before running over the whole table.
    sql = ('select id, npages from resolved_papers where %s and %s = 1 '
           'and id in (1702);' % (conditions, type))
    print(sql)
    papers = pd.read_sql(sql, con=db)

    # A column name cannot be a bound parameter; ``type`` was validated above,
    # so interpolating it is safe. The id is passed as a real parameter.
    update_sql = "update resolved_papers set %s = 1 where id = %%s" % type

    for _, row in papers.iterrows():
        # Label-based access: integer keys on a label-indexed Series are
        # deprecated as positional lookups in recent pandas.
        if type == 'head':
            res = _getHead(row['id'], row['npages'])
        else:
            res = _getTail(row['id'], row['npages'])

        if not res:
            continue

        # Plain int so the DB driver does not have to escape a numpy scalar.
        paper_id = int(row['id'])
        try:
            cur.execute(update_sql, (paper_id,))
            db.commit()
        except Exception as exc:
            # Best effort: keep processing the remaining rows, but say why
            # this one failed instead of swallowing the error silently.
            db.rollback()
            print("Id: %s. Update failed: %s" % (paper_id, exc))
        print("Id: %s. %s: %s" % (paper_id, type.title(), res))
    # NOTE(review): this closes the shared module-level cursor, so a second
    # call in the same process needs a fresh cursor -- confirm this is intended.
    cur.close()
Ejemplo n.º 2
0
def _countOccurencies(papers):
    """Count keyword occurrences in one paper title and store the non-zero ones.

    The title is passed through ``_processText`` and ``_processNL``, the
    resulting words are counted with ``nltk.FreqDist``, and for every keyword
    that occurs at least once a row ``(id, keyword, count)`` is inserted into
    ``resolved_papers_title_occurrencies``.

    A dedicated connection is opened per call so the function can be used from
    multiprocessing workers; it is now always closed before returning
    (previously only the cursor was closed and the connection leaked).

    Args:
        papers: A two-item sequence ``(id, title)``.
    """
    # Already lower-cased and stripped, so no per-call normalisation is needed.
    keywords = (
        "cross-language",
        "crosslanguage",
        "cross-lingual",
        "crosslingual",
        "cross-linguistic",
        "crosslinguistic",
        "multi-language",
        "multilanguage",
        "multi-lingual",
        "multilingual",
        "multi-linguistic",
        "multilinguistic",
        "machine-translation",
        "copy",
        "duplicate",
        "plagiarism",
        "detection",
        "discovery",
    )
    insert_sql = ("insert into resolved_papers_title_occurrencies "
                  "values (%s, %s, %s);")

    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        paper_id, title = papers

        text = _processText(title)
        words = _processNL(text)
        fdist = nltk.FreqDist(words)

        for keyword in keywords:
            count = fdist[keyword]
            if count <= 0:
                continue
            try:
                # Bound parameters instead of string formatting: the driver
                # takes care of quoting.
                cur.execute(insert_sql, (paper_id, keyword, count))
                db.commit()
                print('saved')
            except Exception as exc:
                # Best effort per keyword: undo this insert and carry on.
                db.rollback()
                print('keyword %r not saved: %s' % (keyword, exc))
    except Exception as exc:
        db.rollback()
        print('no saved')
        print(repr(exc))
    finally:
        cur.close()
        db.close()
Ejemplo n.º 3
0
def classifyPub():
    """Classify every still-unclassified paper and store the resulting type.

    Selects papers from ``resolved_papers`` whose ``type`` is NULL (and that
    passed the download, page-count, pdf2text, english, head and tail
    filters) through the module-level ``db`` connection, calls
    ``_classifyPub`` with the paper id and page count, and writes any truthy
    result into the ``type`` column through the module-level cursor ``cur``.
    """
    sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and type is NULL;'
    papers = pd.read_sql(sql, con=db)

    update_sql = "update resolved_papers set type = %s where id = %s"

    for _, row in papers.iterrows():
        # Label-based access: integer keys on a label-indexed Series are
        # deprecated as positional lookups in recent pandas.
        res = _classifyPub(row['id'], row['npages'])
        if not res:
            continue

        # Plain int so the DB driver does not have to escape a numpy scalar.
        paper_id = int(row['id'])
        # Same human-readable trace as before; the statement actually sent
        # uses bound parameters so a quote in the label cannot break it.
        print("update resolved_papers set type = '%s' where id = %s" % (
            res, paper_id))
        try:
            # str(res) mirrors the old '%s' formatting of the label.
            cur.execute(update_sql, (str(res), paper_id))
            db.commit()
        except Exception as exc:
            # Best effort: keep going with the next paper, but report why.
            db.rollback()
            print("Id: %s. Update failed: %s" % (paper_id, exc))
        print("Id: %s. type: %s" % (paper_id, res))
    # NOTE(review): this closes the shared module-level cursor -- confirm no
    # later code in the same process still needs it.
    cur.close()
Ejemplo n.º 4
0
def updateNumPages():
    """Fill in ``npages`` for downloaded papers that do not have it yet.

    Selects the ids of rows in ``resolved_papers`` with ``downloaded = 1`` and
    ``npages`` NULL through the module-level ``db`` connection, asks
    ``_getNPages`` for each id, and stores every truthy result through the
    module-level cursor ``cur``. A falsy result (``None``, ``0``) leaves the
    row untouched.
    """
    sql = 'select id from resolved_papers where downloaded = 1 and npages is NULL;'
    papers = pd.read_sql(sql, con=db)

    update_sql = "update resolved_papers set npages = %s where id = %s"

    for _, row in papers.iterrows():
        pages = _getNPages(row['id'])
        if not pages:
            continue

        # Label-based access everywhere (the original mixed row['id'] with the
        # deprecated positional row[0]); plain int for the DB driver.
        paper_id = int(row['id'])
        try:
            cur.execute(update_sql, (pages, paper_id))
            db.commit()
        except Exception as exc:
            # Best effort: keep going with the next paper, but report why.
            db.rollback()
            print("Id: %s. Update failed: %s" % (paper_id, exc))
        print("Id: %s. Num Pages: %s" % (paper_id, pages))
    # NOTE(review): this closes the shared module-level cursor -- confirm no
    # later code in the same process still needs it.
    cur.close()
Ejemplo n.º 5
0
def _titlesLang(ids):
    """Detect the language of one paper title and store it.

    Calls ``_checkTitle`` on the title and inserts ``(id, lang)`` into
    ``resolved_papers_title``. Prints ``saved`` on success and ``no saved`` on
    any failure.

    A dedicated connection is opened per call so the function can be used from
    multiprocessing workers; it is now always closed before returning
    (previously only the cursor was closed and the connection leaked).

    Args:
        ids: A two-item sequence ``(id, title)``.
    """
    insert_sql = "insert into resolved_papers_title values (%s, %s);"

    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        paper_id, title = ids
        lang = _checkTitle(title)
        # Same human-readable trace as before; the statement actually sent
        # uses bound parameters so a quote in ``lang`` cannot break it.
        print("insert into resolved_papers_title values (%s, '%s');" % (
            paper_id, lang))
        # str(lang) mirrors the old '%s' formatting of the value.
        cur.execute(insert_sql, (paper_id, str(lang)))
        db.commit()
        print("saved")
    except Exception as exc:
        db.rollback()
        print('no saved')
        print(repr(exc))
    finally:
        cur.close()
        db.close()
Ejemplo n.º 6
0
def languageDetection():
    # sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 0 and id in (12,	70,	74,	77,	92,	108,	110,	111,	113,	127,	128,	129,	133,	136,	145,	149,	151,	189,	210,	223,	238,	247,	253,	276,	287,	289,	291,	292,	303,	308,	345,	346,	347,	349,	350,	351,	354,	355,	359,	360,	361,	362,	363,	364,	365,	368,	377,	381,	389,	393,	395,	406,	414,	424,	439,	446,	448,	549,	554,	558,	574,	577,	578,	579,	581,	582,	583,	585,	588,	589,	591,	592,	595,	597,	601,	604,	605,	609,	613,	621,	625,	682,	684,	712,	713,	714,	715,	716,	717,	719,	722,	723,	724,	726,	730,	731,	732,	734,	735,	738,	739,	740,	743,	749,	751,	752,	753,	754,	755,	758,	765,	782,	787,	816,	822,	830,	836,	851,	857,	860,	861,	869,	882,	970,	1044,	1045,	1047,	1050,	1052,	1055,	1056,	1057,	1058,	1060,	1061,	1062,	1063,	1064,	1065,	1066,	1068,	1069,	1072,	1073,	1074,	1075,	1076,	1079,	1080,	1083,	1084,	1086,	1087,	1089,	1094,	1100,	1104,	1105,	1106,	1115,	1116,	1117,	1122,	1124,	1125,	1126,	1131,	1133,	1142,	1143,	1146,	1150,	1151,	1172,	1174,	1176,	1184,	1194,	1248,	1283,	1301,	1307,	1309,	1367,	1381,	1417,	1419,	1452,	1456,	1482,	1491,	1507,	1511,	1513,	1522,	1542,	1562,	1585,	1587,	1591,	1624,	1626,	1628,	1652,	1687,	1688,	1689,	1692,	1693,	1694,	1696,	1698,	1699,	1701,	1704,	1710,	1711,	1714,	1716,	1719,	1720,	1727,	1728,	1730,	1745,	1750,	1751,	1755,	1757,	1770,	1809,	1815,	1820,	1831,	1835,	1872,	1884,	1887,	1898,	1935,	1955,	1993,	2009,	2025,	2026,	2029,	2030,	2031,	2199,	2241,	2244,	2246,	2275,	2276,	2277,	2278,	2279,	2305,	2323,	2324,	2325,	2327,	2328,	2347,	2360,	2402,	2404,	2410,	2415,	2442,	2448,	2450,	2451,	2452,	2461,	2462,	2467,	2477,	2509,	2510,	2512,	2513,	2518,	2522,	2524,	2531,	2543,	2547,	2554,	2555,	2576,	2577,	2578,	2579,	2580,	2583,	2586,	2605,	2609,	2624,	2629,	2646,	2651,	2652,	2653,	2655,	2656,	2659,	2661,	2662,	2671,	2676,	2677,	2756,	2757,	2758,	2760,	2761,	2762,	2768,	2771,	2772,	2773,	2774,	2776,	2777,	2781,	2782,	2783,	2786,	2789,	2790,	2791,	2792,	
2793,	2794,	2795,	2798,	2811,	2815,	2822,	2869,	2884,	2907,	2913,	2920,	2924,	3029,	3127,	3141,	3146,	3172,	3173,	3174,	3175,	3176,	3177,	3178,	3180,	3182,	3183,	3184,	3185,	3189,	3192,	3194,	3198,	3199,	3202,	3203,	3207,	3208,	3211,	3223,	3224,	3230,	3236,	3252,	3253,	3262,	3275,	3302,	3305,	3316,	3365,	3388,	3389,	3391,	3392,	3396,	3397,	3398,	3399,	3400,	3401,	3402,	3405,	3406,	3408,	3409,	3412,	3415,	3416,	3418,	3419,	3420,	3421,	3422,	3423,	3424,	3425,	3426,	3427,	3428,	3431,	3432,	3433,	3436,	3438,	3439,	3443,	3444,	3445,	3446,	3450,	3452,	3455,	3456,	3458,	3461,	3466,	3467,	3470,	3503,	3526,	3532,	3536,	3538,	3541,	3542,	3543,	3549,	3563,	3573,	3597,	3598,	3620,	3626,	3662,	3819,	3921,	3922,	3923,	3925,	3927,	3931,	3932,	3933,	3934,	3935,	3936,	3937,	3938,	3939,	3940,	3942,	3943,	3944,	3945,	3948,	3950,	3952,	3953,	3954,	3955,	3957,	3958,	3959,	3960,	3961,	3963,	3965,	3966,	3967,	3968,	3971,	3972,	3980,	3988,	3995,	4000,	4005,	4011,	4039,	4043,	4046,	4048,	4050,	4059,	4077,	4086,	4089,	4098,	4101,	4104,	4109,	4111,	4123,	4127,	4170,	4184,	4203,	4215,	4221,	4235,	4287,	4295,	4345,	4362,	4367,	4448,	4449,	4451,	4452,	4453,	4454,	4455,	4457,	4458,	4459,	4460,	4461,	4462,	4463,	4464,	4465,	4466,	4467,	4468,	4469,	4470,	4472,	4478,	4480,	4481,	4482,	4496,	4500,	4504,	4508,	4513,	4518,	4523,	4524,	4548,	4551,	4567,	4572,	4598,	4607,	4608,	4611,	4657,	4786,	4788,	4789,	4791,	4792,	4793,	4794,	4795,	4796,	4797,	4798,	4799,	4804,	4805,	4811,	4815,	4817,	4819,	4829,	4839,	4840,	5037,	5038,	5040,	5047,	5179,	5191,	5192,	5210,	5248,	5249,	5266,	5275,	5276,	5322,	5323,	5327,	5330,	5362,	5410,	5411,	5416,	5451,	5462,	5493,	5494,	5496,	5519,	5536,	5548,	5555,	5587,	5588,	5589,	5590,	5591,	5594,	5599,	5617,	5633,	5636,	5660,	5667,	5695,	5697,	5701,	5702,	5706,	5767,	5768,	5769,	5773,	5778,	5786,	5831,	5832,	5833,	5835,	5836,	5837,	5839,	5844,	5849,	5850,	5858,	5860,	5889,	5901,	5915,	5916,	5918,	5920,	5991,	5992,	5993,	5994,	5995,	6009,	6045,	6079,	6080,	6081,	6083,	6084,	
6085,	6086,	6087,	6100,	6101,	6107,	6185,	6249,	6278,	6279,	6280,	6281,	6282,	6283,	6285,	6305,	6306,	6387,	6393,	6396,	6397,	6398,	6411,	6439,	6498,	6505,	6511,	6513,	6518,	6520,	6524,	6525,	6526,	6527,	6532,	6543,	6553,	6555,	6565,	6566,	6569,	6573,	6574,	6581,	6585,	6601,	6605,	6606,	6612,	6615,	6617,	6621,	6645,	6646,	6648,	6651,	6652,	6658,	6660,	6667,	6672,	6676,	6682,	6684,	6688,	6690,	6692,	6693,	6700,	6704,	6743,	6769,	6771,	6772,	6775,	6778,	6783,	6785,	6789,	6793,	6818,	6824,	6829,	6830,	6834,	6839,	6845,	6846,	6849,	6850,	6855,	6859,	6866,	6873,	6878,	6887,	6888,	6889,	6890,	6907,	6926,	6945,	6948,	6954,	6963,	7006,	7066,	7082,	7102,	7121,	7162,	7163,	7271,	7272,	7273,	7285,	7314,	7315,	7350,	7362,	7364,	7398,	7441,	7442,	7443,	7444,	7446,	7451,	7454,	7456,	7462,	7464,	7504,	7515,	7516,	7547,	7548,	7634,	7659,	7660,	7661,	7662,	7663,	7664,	7665,	7672,	7776,	7777,	7783,	7784,	7788,	7789,	7792,	7795,	7797,	7798,	7799,	7809,	7831,	7889,	7917,	7918,	7920,	7926,	7930,	7932,	7933,	7935,	7936,	7941,	7944,	7960,	7962,	7971,	8008,	8017,	8070,	8075,	8076,	8110,	8111,	8112,	8117,	8120,	8128,	8129,	8130,	8133,	8136,	8140,	8143,	8144,	8145,	8148,	8149,	8150,	8153,	8154,	8159,	8163,	8203,	8225,	8268,	8270,	8302,	8310,	8312,	8419,	8421,	8496,	8497,	8498,	8500,	8505,	8506,	8507,	8508,	8510,	8513,	8517,	8533,	8543,	8584,	8710,	8717,	8718,	8719,	8720,	8721,	8722,	8724,	8726,	8730,	8732,	8733,	8734,	8737,	8739,	8740,	8741,	8742,	8743,	8744,	8745,	8747,	8748,	8750,	8751,	8752,	8753,	8754,	8755,	8756,	8757,	8759,	8761,	8764,	8766,	8768,	8769,	8773,	8774,	8775,	8784,	8811,	8817,	9042,	9056,	9207,	9219,	9240,	9249,	9273,	9318,	9322,	9422,	9457,	9485,	9562,	9623,	9647,	9836,	9837,	9922,	10067,	10068,	10069,	10168,	10185,	10288,	10400,	10401,	10513,	10515,	10606,	10700,	10702,	10703,	10771,	10772,	10819,	10821,	10927,	11019,	11056,	11113,	11142,	11143,	11225,	11226,	11227,	11343,	11361,	11362,	11364,	11377,	11448,	11460,	11461,	11462,	11463,	11465,	11466,	11468,	11493,	11609,	
11610,	11611,	11617,	11638,	11659,	11718,	11748,	11749,	11750,	11751,	11762,	11821,	11850,	11891,	11898,	11911,	11913,	11914,	11915,	11916,	11917,	11918,	11919,	11920,	11921,	11922,	11923,	11926,	11928,	11934,	11955,	11980,	12026,	12030,	12044,	12092,	12093,	12094,	12095,	12096,	12098,	12100,	12101,	12102,	12103,	12104,	12105,	12106,	12107,	12108,	12109,	12110,	12111,	12112,	12113,	12114,	12122,	12123,	12125,	12144,	12147,	12234,	12235,	12237,	12256,	12305,	12339,	12346,	12407,	12448,	12511,	12665,	12705,	12706,	12708,	12709,	12710,	12711,	12712,	12713,	12714,	12716,	12717,	12718,	12719,	12720,	12721,	12722,	12725,	12729,	12742,	12753,	12762,	12802,	12813,	12816,	12821,	12823,	12843,	12856,	12905,	12907,	13006,	13061,	13062,	13063,	13137,	13138,	13198,	13329,	13330,	13331,	13332,	13494,	13495,	13582,	13583,	13584,	13585,	13586,	13697,	13833,	13834,	13835,	13836,	13837,	13840,	14160,	14161,	14200,	14341,	14342,	14343,	14590,	14591,	14597,	14610,	14614,	14631,	14632,	14633,	14634,	14635,	14650,	14655,	14656,	14689,	14726,	14777,	14870,	14871,	14872,	14921,	14922,	14923,	14991,	14992,	14993,	14994,	14995,	15136,	15137,	15138,	15139,	15140,	15141,	15142,	15143,	15152,	15216,	15265,	15277,	15387,	15388,	15483,	15546,	15550,	15587,	15590,	15623,	15641,	15653,	15711,	15712,	15730,	15743,	15763,	15794,	15805,	15821,	15831,	15884,	15932,	16039,	16122,	16124,	16153,	16175,	16181,	16220,	16233,	16264,	16277,	16306,	16361,	16377,	16391,	16392,	16393,	16402,	16404,	16431,	16439,	16440,	16444,	16447,	16448,	16455,	16457,	16463,	16468,	16513,	16524,	16528,	16551,	16569,	16594,	16596,	16600,	16610,	16647,	16648,	16718,	16731,	16763,	16765,	16794,	16795,	16899,	16948,	16962,	16993,	16998,	17011,	17013,	17034,	17061,	17062,	17141,	17142,	17143,	17144,	17155,	17158,	17248,	17262,	17263,	17264,	17265,	17266,	17333,	17334,	17335,	17395,	17396,	17398,	17400,	17401,	17405,	17410,	17412,	17417,	17420,	17431,	17547,	17584,	17585,	17587,	17599,	17674,	17676,	17677,	17679,	17711,	17719,	
17749,	17750,	17751,	17752,	17753,	17754,	17756,	17757,	17811,	17812,	17814,	17948,	17963,	17964,	17965,	17989,	17998,	18083,	18139,	18145,	18165,	18229,	18230,	18257,	18264,	18273,	18321,	18322,	18323,	18351,	18515,	18548,	18599,	18600,	18623,	18637,	18675,	18676,	18687,	18698,	18736,	18753,	18768,	18792,	18794,	18797,	18823,	18828,	18830,	18850,	18851,	18853,	18854,	18857,	18882,	18885,	18886,	18887,	18888,	18891,	18892,	18893,	18894,	18898,	18901,	18904,	18930,	18947,	18967,	18968,	18970,	18972,	18973,	18974,	18976,	18977,	18980,	18982,	18983,	18984,	18985,	18986,	18991,	19006,	19059,	19060,	19061,	19062,	19064,	19066,	19067,	19069,	19071,	19103,	19104,	19110,	19116,	19153,	19180,	19181,	19186,	19263,	19272,	19273,	19280,	19318,	19409,	19425,	19428,	19456,	19528,	19531,	19538,	19606,	19607,	19609,	19610,	19612,	19613,	19616,	19623,	19636,	19647,	19648,	19685,	19798,	19799,	19800,	19801,	19802,	19805,	19806,	19807,	19808,	19811,	19812,	19813,	19816,	19820,	19821,	19836,	19874,	19875,	19878,	19960,	19985,	20051,	20052,	20053,	20054,	20055,	20056,	20057,	20058,	20059,	20061,	20062,	20063,	20064,	20065,	20066,	20069,	20070,	20071,	20072,	20074,	20078,	20079,	20081,	20084,	20088,	20090,	20110,	20156,	20157,	20168,	20189,	20193,	20245,	20344,	20345,	20346,	20347,	20348,	20349,	20350,	20353,	20354,	20355,	20356,	20357,	20358,	20359,	20360,	20361,	20362,	20363,	20365,	20368,	20370,	20371,	20373,	20374,	20377,	20391,	20392,	20396,	20398,	20400,	20444,	20476,	20520,	20682,	20685,	20687,	20688,	20689,	20690,	20691,	20692,	20693,	20694,	20695,	20698,	20699,	20700,	20701,	20702,	20703,	20707,	20709,	20714,	20728,	20760,	20774,	20864,	20865,	20866,	20867,	20868,	20869,	20870,	20872,	20874,	20899,	20909,	20962,	21041,	21042,	21117,	21118,	21121,	21139,	21146,	21227,	21271,	21272,	21273,	21274,	21275,	21425,	21430,	21493,	21505,	21507,	21510,	21513,	21612,	21616,	21621,	21622,	21623,	21624,	21667,	21675,	21751,	21765,	21766,	21767,	21846,	21847,	21856,	21857,	21858,	21871,	
21872,	21873,	21875,	21876,	21877,	21881,	21883,	21885,	21924,	21925,	21957,	21977,	21978,	21979,	21980,	21984,	21985,	21993,	21997,	21999,	22001,	22031,	22033,	22082,	22113,	22175,	22228,	22247,	22271,	22272,	22371,	22374,	22462,	22463,	22613,	22694,	22695,	22696,	22697,	22700,	22880,	22881,	22882,	22883,	22884,	22901,	22977,	22978,	22979,	22981,	23030,	23032,	23191,	23230,	23236,	23238,	23291,	23340,	23453,	23552,	23553,	23744,	23761,	23774,	24016,	24025,	24037,	24085,	24090,	24096,	24125,	24126,	24128,	24129,	24130,	24132,	24133,	24140,	24141,	24142,	24145,	24150,	24151,	24152,	24153,	24155,	24168,	24169,	24170,	24171,	24172,	24173,	24174,	24181,	24186,	24187,	24189,	24190,	24192,	24193,	24206,	24207,	24208,	24209,	24210,	24211,	24212,	24213,	24214,	24239,	24243,	24244,	24246,	24247,	24249,	24250,	24251,	24252,	24253,	24254,	24255,	24256,	24257,	24258,	24261,	24290,	24297,	24298,	24299,	24300,	24301,	24302,	24303,	24304,	24305,	24307,	24308,	24315,	24326,	24330,	24334,	24335,	24336,	24350,	24364,	24365,	24366,	24367,	24368,	24371,	24372,	24390,	24391,	24393,	24405,	24406,	24408,	24411,	24412,	24413,	24415,	24438,	24439,	24440,	24473,	24474,	24476,	24477,	24478,	24479,	24480,	24481,	24483,	24484,	24485,	24486,	24487,	24520,	24522,	24523,	24524,	24525,	24526,	24527,	24528,	24529,	24530,	24531,	24532,	24533,	24535,	24536,	24537,	24540,	24541,	24542,	24543,	24544,	24545,	24546,	24547,	24549,	24550,	24576,	24586,	24621,	24622,	24623,	24624,	24625,	24626,	24627,	24628,	24629,	24630,	24631,	24632,	24633,	24634,	24635,	24636,	24637,	24638,	24639,	24640,	24641,	24642,	24644,	24645,	24646,	24647,	24648,	24651,	24652,	24653,	24654,	24655,	24656,	24657,	24712,	24713,	24714,	24715,	24716,	24717,	24719,	24720,	24721,	24722,	24723,	24724,	24731,	24775,	24795,	24812,	24831,	24833,	24835,	24836,	24845,	24846,	24851,	24869,	24877,	24888,	24889,	24907,	24926,	24952,	25091,	25169,	25177,	25178,	25195,	25206,	25247,	25248,	25251,	25267,	25340,	25345,	25455,	25456,	25460,	25464,	
25754,	25822,	25845,	25865,	25890,	25891,	25893,	25914,	25975,	25976,	25978,	25980,	25982,	25986,	25996,	26003,	26074,	26112,	26143,	26172,	26182,	26183,	26186,	26194,	26202,	26283,	26284,	26287,	26289,	26293,	26303,	26316,	26320,	26322,	26463,	26465,	26467,	26469,	26476,	26481,	26486,	26489,	26497,	26596,	26663,	26678,	26717,	27136,	27183,	27307,	27340,	27341,	27342,	27344,	27348,	27355,	27607,	27608,	27609,	27610,	27623,	27635,	27641,	27922,	27937,	28165,	28263,	28277,	28422,	28433,	28437,	28508,	28738,	28739,	28740,	28743,	28748,	28820,	28990,	28993,	28997,	29008,	29009,	29010,	29011,	29079,	29084,	29090,	29093,	29101,	29102,	29104,	29105,	29106,	29112,	29113,	29114,	29119,	29120,	29122,	29123,	29124,	29125,	29129,	29130,	29133,	29134,	29135,	29137,	29139,	29146,	29147,	29172,	29174,	29176,	29184,	29191,	29192,	29194,	29200,	29201,	29203,	29221,	29224,	29225,	29226,	29232,	29234,	29258,	29265,	29268,	29273,	29274,	29275,	29276,	29277,	29278,	29280,	29281,	29282,	29300,	29301,	29302,	29310,	29313,	29314,	29315,	29316,	29320,	29382,	29435,	29436,	29454,	29457,	29458,	29468,	29469,	29470,	29473,	29475,	29476,	29477,	29481,	29482,	29483,	29485,	29500,	29501,	29503,	29504,	29505,	29508,	29513,	29515,	29524,	29532,	29533,	29534,	29535,	29537,	29549,	29553,	29556,	29561,	29574,	29618,	29634,	29635,	29637,	29639,	29665,	29666,	29668,	29669,	29672,	29682,	29693,	29709,	29710,	29711,	29717,	29741,	29742,	29746,	29747,	29752,	29753,	29755,	29756,	29759,	29804,	29805,	29832,	29998,	30003,	30005,	30006,	30007,	30009,	30019,	30025,	30040,	30074,	30075,	30077,	30078,	30080,	30082,	30083,	30084,	30290,	30291,	30293,	30349,	30350,	30351,	30352,	30353,	30354,	30358,	30376,	30392,	30424,	30426,	30589,	30590,	30591,	30613,	30614,	30615,	30616,	30617,	30619,	30627,	30628,	30647,	30954,	30958,	30985,	30986,	31316,	31317,	31331,	31334,	31336,	31357,	31358,	31359,	31360,	31497,	31501,	31502,	31503,	31504,	31526,	31527,	31528,	31882,	31883,	31884,	31890,	31891,	31892,	31893,	31894,	
31929,	31966,	31970,	32153,	32498,	32520,	32583,	32618,	32683,	32769,	32780,	32788,	32847,	32848,	32857,	32872,	33058,	33148,	33153,	33255,	33275,	33279,	33300,	33513,	33519,	33520,	33521,	33522,	33524,	33525,	33527,	33528,	33534,	33578,	33579,	33580,	33581,	33582,	33584,	33585,	33586,	33587,	33589,	33591,	33593,	33594,	33599,	33600,	33602,	33619,	33634,	33655,	33753,	33845,	33846,	33866,	33868,	33869,	33871,	33873,	33883,	33888,	33890,	33891,	33907,	33926,	33931,	33933,	33934,	33936,	33972,	33973,	33978,	33987,	33988,	33989,	33990,	33991,	33992,	33993,	33997,	33998,	34000,	34001,	34007,	34015,	34050,	34058,	34081,	34082,	34085,	34086,	34089,	34091,	34092,	34095,	34260,	34265,	34293,	34294,	34295,	34296,	34297,	34309,	34315,	34316,	34320,	34346,	34399,	34419,	34461,	34462,	34463,	34464,	34465,	34469,	34503,	34527,	34590,	34816,	34827,	34845,	34846,	34849,	34852,	34853,	34863,	34941,	34971,	35015,	35020,	35134,	35136,	35144,	35156,	35206,	35221,	35264,	35285,	35292,	35294,	35295,	35296,	35299,	35300,	35301,	35309,	35311,	35315,	35321,	35323,	35324,	35328,	35329,	35330,	35331,	35332,	35342,	35343,	35347,	35351,	35356,	35357,	35386,	35415,	35428,	35440,	35459,	35467,	35471,	35474,	35529,	35562,	35575,	35634,	35637,	35646,	35655,	35663,	35691,	35704,	35732,	35733,	35744,	35835,	35853,	35881,	35884,	35887,	35889,	35893,	35894,	35896,	35897,	35898,	35899,	35900,	35901,	35902,	35907,	35909,	35910,	35917,	35918,	35920,	35921,	35923,	35926,	35928,	35929,	35930,	35939,	35941,	35943,	35944,	35948,	35949,	35950,	35951,	35953,	35954,	35957,	35979,	35997,	35998,	36000,	36018,	36021,	36023,	36089,	36093,	36098,	36099,	36102,	36105,	36111,	36136,	36154,	36172,	36173,	36175,	36193,	36200,	36210,	36223,	36225,	36226,	36229,	36230,	36233,	36239,	36240,	36241,	36242,	36244,	36246,	36247,	36248,	36249,	36258,	36264,	36267,	36269,	36370,	36433,	36437,	36469,	36479,	36480,	36481,	36504,	36515,	36520,	36521,	36529,	36530,	36550,	36584,	36599,	36600,	36608,	36614,	36666,	36674,	36685,	
36707,	36717,	36736,	36743,	36756,	36760,	36775,	36784,	36785,	36787,	36804,	36830,	36843,	36844,	36850,	36854,	36860,	36870,	36874,	36875,	36876,	36877,	36879,	36952,	36958,	36979,	36980,	36991,	36996,	37050,	37051,	37058,	37092,	37093,	37111,	37117,	37120,	37123,	37137,	37142,	37147,	37148,	37149,	37150,	37151,	37152,	37170,	37176,	37187,	37190,	37192,	37193,	37198,	37201,	37205,	37209,	37217,	37221,	37226,	37227,	37231,	37242,	37244,	37255,	37266,	37319,	37324,	37352,	37365,	37375,	37415,	37429,	37448,	37450,	37452,	37495,	37518,	37519,	37569,	37570,	37572,	37573,	37576,	37597,	37608,	37627,	37676,	37677,	37735,	37743,	37748,	37749,	37750,	37751,	37756,	37758,	37766,	37767,	37792,	37801,	37805,	37807,	37808,	37812,	37828,	37834,	37835,	37838,	37840,	37841,	37842,	37843,	37844,	37845,	37846,	37849,	37850,	37852,	37854,	37863,	37866,	37873,	37877,	37880,	37881,	37883,	37897,	37900,	37908,	37927,	37996,	38008,	38081,	38085,	38091,	38092,	38161,	38183,	38187,	38195,	38200,	38282,	38292,	38300,	38302,	38303,	38309,	38314,	38316,	38317,	38321,	38360,	38368,	38374,	38382,	38398,	38399,	38402,	38403,	38410,	38411,	38420,	38429,	38431,	38439,	38452,	38464,	38467,	38483,	38499,	38500,	38514,	38515,	38530,	38533,	38547,	38548,	38556,	38558,	38559,	38560,	38561,	38563,	38564,	38565,	38566,	38567,	38568,	38569,	38571,	38574,	38575,	38578,	38619,	38635);'
    sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 0 and id in (12,	70,	74,	77,	92,	108,	110,	111,	113,	127,	128,	129,	133,	136,	145,	149,	151,	189,	210,	223,	238,	247,	253,	276,	287,	289,	291,	292,	303,	308,	345,	346,	347,	349,	350,	351,	354,	355,	359,	360,	361,	362,	363,	364,	365,	368,	377,	381,	389,	393,	395,	406,	414,	424,	439,	446,	448,	549,	554,	558,	574,	577,	578,	579,	581,	582,	583,	585,	588,	589,	591,	592,	595,	597,	601,	604,	605,	609,	613,	621,	625,	682,	684,	712,	713,	714,	715,	716,	717,	719,	722,	723,	724,	726,	730,	731,	732,	734,	735,	738,	739,	740,	743,	749,	751,	752,	753,	754,	755,	758,	765,	782,	787,	816,	822,	830,	836,	851,	857,	860,	861,	869,	882,	970,	1044,	1045,	1047,	1050,	1052,	1055,	1056,	1057,	1058,	1060,	1061,	1062,	1063,	1064,	1065,	1066,	1068,	1069,	1072,	1073,	1074,	1075,	1076,	1079,	1080,	1083,	1084,	1086,	1087,	1089,	1094,	1100,	1104,	1105,	1106,	1115,	1116,	1117,	1122,	1124,	1125,	1126,	1131,	1133,	1142,	1143,	1146,	1150,	1151,	1172,	1174,	1176,	1184,	1194,	1248,	1283,	1301,	1307,	1309,	1367,	1381,	1417,	1419,	1452,	1456,	1482,	1491,	1507,	1511,	1513,	1522,	1542,	1562,	1585,	1587,	1591,	1624,	1626,	1628,	1652,	1687,	1688,	1689,	1692,	1693,	1694,	1696,	1698,	1699,	1701,	1704,	1710,	1711,	1714,	1716,	1719,	1720,	1727,	1728,	1730,	1745,	1750,	1751,	1755,	1757,	1770,	1809,	1815,	1820,	1831,	1835,	1872,	1884,	1887,	1898,	1935,	1955,	1993,	2009,	2025,	2026,	2029,	2030,	2031,	2199,	2241,	2244,	2246,	2275,	2276,	2277,	2278,	2279,	2305,	2323,	2324,	2325,	2327,	2328,	2347,	2360,	2402,	2404,	2410,	2415,	2442,	2448,	2450,	2451,	2452,	2461,	2462,	2467,	2477,	2509,	2510,	2512,	2513,	2518,	2522,	2524,	2531,	2543,	2547,	2554,	2555,	2576,	2577,	2578,	2579,	2580,	2583,	2586,	2605,	2609,	2624,	2629,	2646,	2651,	2652,	2653,	2655,	2656,	2659,	2661,	2662,	2671,	2676,	2677,	2756,	2757,	2758,	2760,	2761,	2762,	2768,	2771,	2772,	2773,	2774,	2776,	2777,	2781,	2782,	2783,	2786,	2789,	2790,	2791,	2792,	
2793,	2794,	2795,	2798,	2811,	2815,	2822,	2869,	2884,	2907,	2913,	2920,	2924,	3029,	3127,	3141,	3146,	3172,	3173,	3174,	3175,	3176,	3177,	3178,	3180,	3182,	3183,	3184,	3185,	3189,	3192,	3194,	3198,	3199,	3202,	3203,	3207,	3208,	3211,	3223,	3224,	3230,	3236,	3252,	3253,	3262,	3275,	3302,	3305,	3316,	3365,	3388,	3389,	3391,	3392,	3396,	3397,	3398,	3399,	3400,	3401,	3402,	3405,	3406,	3408,	3409,	3412,	3415,	3416,	3418,	3419,	3420,	3421,	3422,	3423,	3424,	3425,	3426,	3427,	3428,	3431,	3432,	3433,	3436,	3438,	3439,	3443,	3444,	3445,	3446,	3450,	3452,	3455,	3456,	3458,	3461,	3466,	3467,	3470,	3503,	3526,	3532,	3536,	3538,	3541,	3542,	3543,	3549,	3563,	3573,	3597,	3598,	3620,	3626,	3662,	3819,	3921,	3922,	3923,	3925,	3927,	3931,	3932,	3933,	3934,	3935,	3936,	3937,	3938,	3939,	3940,	3942,	3943,	3944,	3945,	3948,	3950,	3952,	3953,	3954,	3955,	3957,	3958,	3959,	3960,	3961,	3963,	3965,	3966,	3967,	3968,	3971,	3972,	3980,	3988,	3995,	4000,	4005,	4011,	4039,	4043,	4046,	4048,	4050,	4059,	4077,	4086,	4089,	4098,	4101,	4104,	4109,	4111,	4123,	4127,	4170,	4184,	4203,	4215,	4221,	4235,	4287,	4295,	4345,	4362,	4367,	4448,	4449,	4451,	4452,	4453,	4454,	4455,	4457,	4458,	4459,	4460,	4461,	4462,	4463,	4464,	4465,	4466,	4467,	4468,	4469,	4470,	4472,	4478,	4480,	4481,	4482,	4496,	4500,	4504,	4508,	4513,	4518,	4523,	4524,	4548,	4551,	4567,	4572,	4598,	4607,	4608,	4611,	4657,	4786,	4788,	4789,	4791,	4792,	4793,	4794,	4795,	4796,	4797,	4798,	4799,	4804,	4805,	4811,	4815,	4817,	4819,	4829,	4839,	4840,	5037,	5038,	5040,	5047,	5179,	5191,	5192,	5210,	5248,	5249,	5266,	5275,	5276,	5322,	5323,	5327,	5330,	5362,	5410,	5411,	5416,	5451,	5462,	5493,	5494,	5496,	5519,	5536,	5548,	5555,	5587,	5588,	5589,	5590,	5591,	5594,	5599,	5617,	5633,	5636,	5660,	5667,	5695,	5697,	5701,	5702,	5706,	5767,	5768,	5769,	5773,	5778,	5786,	5831,	5832,	5833,	5835,	5836,	5837,	5839,	5844,	5849,	5850,	5858,	5860,	5889,	5901,	5915,	5916,	5918,	5920,	5991,	5992,	5993,	5994,	5995,	6009,	6045,	6079,	6080,	6081,	6083,	6084,	
6085,	6086,	6087,	6100,	6101,	6107,	6185,	6249,	6278,	6279,	6280,	6281,	6282,	6283,	6285,	6305,	6306,	6387,	6393,	6396,	6397,	6398,	6411,	6439,	6498,	6505,	6511,	6513,	6518,	6520,	6524,	6525,	6526,	6527,	6532,	6543,	6553,	6555,	6565,	6566,	6569,	6573,	6574,	6581,	6585,	6601,	6605,	6606,	6612,	6615,	6617,	6621,	6645,	6646,	6648,	6651,	6652,	6658,	6660,	6667,	6672,	6676,	6682,	6684,	6688,	6690,	6692,	6693,	6700,	6704,	6743,	6769,	6771,	6772,	6775,	6778,	6783,	6785,	6789,	6793,	6818,	6824,	6829,	6830,	6834,	6839,	6845,	6846,	6849,	6850,	6855,	6859,	6866,	6873,	6878,	6887,	6888,	6889,	6890,	6907,	6926,	6945,	6948,	6954,	6963,	7006,	7066,	7082,	7102,	7121,	7162,	7163,	7271,	7272,	7273,	7285,	7314,	7315,	7350,	7362,	7364,	7398,	7441,	7442,	7443,	7444,	7446,	7451,	7454,	7456,	7462,	7464,	7504,	7515,	7516,	7547,	7548,	7634,	7659,	7660,	7661,	7662,	7663,	7664,	7665,	7672,	7776,	7777,	7783,	7784,	7788,	7789,	7792,	7795,	7797,	7798,	7799,	7809,	7831,	7889,	7917,	7918,	7920,	7926,	7930,	7932,	7933,	7935,	7936,	7941,	7944,	7960,	7962,	7971,	8008,	8017,	8070,	8075,	8076,	8110,	8111,	8112,	8117,	8120,	8128,	8129,	8130,	8133,	8136,	8140,	8143,	8144,	8145,	8148,	8149,	8150,	8153,	8154,	8159,	8163,	8203,	8225,	8268,	8270,	8302,	8310,	8312,	8419,	8421,	8496,	8497,	8498,	8500,	8505,	8506,	8507,	8508,	8510,	8513,	8517,	8533,	8543,	8584,	8710,	8717,	8718,	8719,	8720,	8721,	8722,	8724,	8726,	8730,	8732,	8733,	8734,	8737,	8739,	8740,	8741,	8742,	8743,	8744,	8745,	8747,	8748,	8750,	8751,	8752,	8753,	8754,	8755,	8756,	8757,	8759,	8761,	8764,	8766,	8768,	8769,	8773,	8774,	8775,	8784,	8811,	8817,	9042,	9056,	9207,	9219,	9240,	9249,	9273,	9318,	9322,	9422,	9457,	9485,	9562,	9623,	9647,	9836,	9837,	9922,	10067,	10068,	10069,	10168,	10185,	10288,	10400,	10401,	10513,	10515,	10606,	10700,	10702,	10703,	10771,	10772,	10819,	10821,	10927,	11019,	11056,	11113,	11142,	11143,	11225,	11226,	11227,	11343,	11361,	11362,	11364,	11377,	11448,	11460,	11461,	11462,	11463,	11465,	11466,	11468,	11493,	11609,	
11610,	11611,	11617,	11638,	11659,	11718,	11748,	11749,	11750,	11751,	11762,	11821,	11850,	11891,	11898,	11911,	11913,	11914,	11915,	11916,	11917,	11918,	11919,	11920,	11921,	11922,	11923,	11926,	11928,	11934,	11955,	11980,	12026,	12030,	12044,	12092,	12093,	12094,	12095,	12096,	12098,	12100,	12101,	12102,	12103,	12104,	12105,	12106,	12107,	12108,	12109,	12110,	12111,	12112,	12113,	12114,	12122,	12123,	12125,	12144,	12147,	12234,	12235,	12237,	12256,	12305,	12339,	12346,	12407,	12448,	12511,	12665,	12705,	12706,	12708,	12709,	12710,	12711,	12712,	12713,	12714,	12716,	12717,	12718,	12719,	12720,	12721,	12722,	12725,	12729,	12742,	12753,	12762,	12802,	12813,	12816,	12821,	12823,	12843,	12856,	12905,	12907,	13006,	13061,	13062,	13063,	13137,	13138,	13198,	13329,	13330,	13331,	13332,	13494,	13495,	13582,	13583,	13584,	13585,	13586,	13697,	13833,	13834,	13835,	13836,	13837,	13840,	14160,	14161,	14200,	14341,	14342,	14343,	14590,	14591,	14597,	14610,	14614,	14631,	14632,	14633,	14634,	14635,	14650,	14655,	14656,	14689,	14726,	14777,	14870,	14871,	14872,	14921,	14922,	14923,	14991,	14992,	14993,	14994,	14995,	15136,	15137,	15138,	15139,	15140,	15141,	15142,	15143,	15152,	15216,	15265,	15277,	15387,	15388,	15483,	15546,	15550,	15587,	15590,	15623,	15641,	15653,	15711,	15712,	15730,	15743,	15763,	15794,	15805,	15821,	15831,	15884,	15932,	16039,	16122,	16124,	16153,	16175,	16181,	16220,	16233,	16264,	16277,	16306,	16361,	16377,	16391,	16392,	16393,	16402,	16404,	16431,	16439,	16440,	16444,	16447,	16448,	16455,	16457,	16463,	16468,	16513,	16524,	16528,	16551,	16569,	16594,	16596,	16600,	16610,	16647,	16648,	16718,	16731,	16763,	16765,	16794,	16795,	16899,	16948,	16962,	16993,	16998,	17011,	17013,	17034,	17061,	17062,	17141,	17142,	17143,	17144,	17155,	17158,	17248,	17262,	17263,	17264,	17265,	17266,	17333,	17334,	17335,	17395,	17396,	17398,	17400,	17401,	17405,	17410,	17412,	17417,	17420,	17431,	17547,	17584,	17585,	17587,	17599,	17674,	17676,	17677,	17679,	17711,	17719,	
17749,	17750,	17751,	17752,	17753,	17754,	17756,	17757,	17811,	17812,	17814,	17948,	17963,	17964,	17965,	17989,	17998,	18083,	18139,	18145,	18165,	18229,	18230,	18257,	18264,	18273,	18321,	18322,	18323,	18351,	18515,	18548,	18599,	18600,	18623,	18637,	18675,	18676,	18687,	18698,	18736,	18753,	18768,	18792,	18794,	18797,	18823,	18828,	18830,	18850,	18851,	18853,	18854,	18857,	18882,	18885,	18886,	18887,	18888,	18891,	18892,	18893,	18894,	18898,	18901,	18904,	18930,	18947,	18967,	18968,	18970,	18972,	18973,	18974,	18976,	18977,	18980,	18982,	18983,	18984,	18985,	18986,	18991,	19006,	19059,	19060,	19061,	19062,	19064,	19066,	19067,	19069,	19071,	19103,	19104,	19110,	19116,	19153,	19180,	19181,	19186,	19263,	19272,	19273,	19280,	19318,	19409,	19425,	19428,	19456,	19528,	19531,	19538,	19606,	19607,	19609,	19610,	19612,	19613,	19616,	19623,	19636,	19647,	19648,	19685,	19798,	19799,	19800,	19801,	19802,	19805,	19806,	19807,	19808,	19811,	19812,	19813,	19816,	19820,	19821,	19836,	19874,	19875,	19878,	19960,	19985,	20051,	20052,	20053,	20054,	20055,	20056,	20057,	20058,	20059,	20061,	20062,	20063,	20064,	20065,	20066,	20069,	20070,	20071,	20072,	20074,	20078,	20079,	20081,	20084,	20088,	20090,	20110,	20156,	20157,	20168,	20189,	20193,	20245,	20344,	20345,	20346,	20347,	20348,	20349,	20350,	20353,	20354,	20355,	20356,	20357,	20358,	20359,	20360,	20361,	20362,	20363,	20365,	20368,	20370,	20371,	20373,	20374,	20377,	20391,	20392,	20396,	20398,	20400,	20444,	20476,	20520,	20682,	20685,	20687,	20688,	20689,	20690,	20691,	20692,	20693,	20694,	20695,	20698,	20699,	20700,	20701,	20702,	20703,	20707,	20709,	20714,	20728,	20760,	20774,	20864,	20865,	20866,	20867,	20868,	20869,	20870,	20872,	20874,	20899,	20909,	20962,	21041,	21042,	21117,	21118,	21121,	21139,	21146,	21227,	21271,	21272,	21273,	21274,	21275,	21425,	21430,	21493,	21505,	21507,	21510,	21513,	21612,	21616,	21621,	21622,	21623,	21624,	21667,	21675,	21751,	21765,	21766,	21767,	21846,	21847,	21856,	21857,	21858,	21871,	
21872,	21873,	21875,	21876,	21877,	21881,	21883,	21885,	21924,	21925,	21957,	21977,	21978,	21979,	21980,	21984,	21985,	21993,	21997,	21999,	22001,	22031,	22033,	22082,	22113,	22175,	22228,	22247,	22271,	22272,	22371,	22374,	22462,	22463,	22613,	22694,	22695,	22696,	22697,	22700,	22880,	22881,	22882,	22883,	22884,	22901,	22977,	22978,	22979,	22981,	23030,	23032,	23191,	23230,	23236,	23238,	23291,	23340,	23453,	23552,	23553,	23744,	23761,	23774,	24016,	24025,	24037,	24085,	24090,	24096,	24125,	24126,	24128,	24129,	24130,	24132,	24133,	24140,	24141,	24142,	24145,	24150,	24151,	24152,	24153,	24155,	24168,	24169,	24170,	24171,	24172,	24173,	24174,	24181,	24186,	24187,	24189,	24190,	24192,	24193,	24206,	24207,	24208,	24209,	24210,	24211,	24212,	24213,	24214,	24239,	24243,	24244,	24246,	24247,	24249,	24250,	24251,	24252,	24253,	24254,	24255,	24256,	24257,	24258,	24261,	24290,	24297,	24298,	24299,	24300,	24301,	24302,	24303,	24304,	24305,	24307,	24308,	24315,	24326,	24330,	24334,	24335,	24336,	24350,	24364,	24365,	24366,	24367,	24368,	24371,	24372,	24390,	24391,	24393,	24405,	24406,	24408,	24411,	24412,	24413,	24415,	24438,	24439,	24440,	24473,	24474,	24476,	24477,	24478,	24479,	24480,	24481,	24483,	24484,	24485,	24486,	24487,	24520,	24522,	24523,	24524,	24525,	24526,	24527,	24528,	24529,	24530,	24531,	24532,	24533,	24535,	24536,	24537,	24540,	24541,	24542,	24543,	24544,	24545,	24546,	24547,	24549,	24550,	24576,	24586,	24621,	24622,	24623,	24624,	24625,	24626,	24627,	24628,	24629,	24630,	24631,	24632,	24633,	24634,	24635,	24636,	24637,	24638,	24639,	24640,	24641,	24642,	24644,	24645,	24646,	24647,	24648,	24651,	24652,	24653,	24654,	24655,	24656,	24657,	24712,	24713,	24714,	24715,	24716,	24717,	24719,	24720,	24721,	24722,	24723,	24724,	24731,	24775,	24795,	24812,	24831,	24833,	24835,	24836,	24845,	24846,	24851,	24869,	24877,	24888,	24889,	24907,	24926,	24952,	25091,	25169,	25177,	25178,	25195,	25206,	25247,	25248,	25251,	25267,	25340,	25345,	25455,	25456,	25460,	25464,	
25754,	25822,	25845,	25865,	25890,	25891,	25893,	25914,	25975,	25976,	25978,	25980,	25982,	25986,	25996,	26003,	26074,	26112,	26143,	26172,	26182,	26183,	26186,	26194,	26202,	26283,	26284,	26287,	26289,	26293,	26303,	26316,	26320,	26322,	26463,	26465,	26467,	26469,	26476,	26481,	26486,	26489,	26497,	26596,	26663,	26678,	26717,	27136,	27183,	27307,	27340,	27341,	27342,	27344,	27348,	27355,	27607,	27608,	27609,	27610,	27623,	27635,	27641,	27922,	27937,	28165,	28263,	28277,	28422,	28433,	28437,	28508,	28738,	28739,	28740,	28743,	28748,	28820,	28990,	28993,	28997,	29008,	29009,	29010,	29011,	29079,	29084,	29090,	29093,	29101,	29102,	29104,	29105,	29106,	29112,	29113,	29114,	29119,	29120,	29122,	29123,	29124,	29125,	29129,	29130,	29133,	29134,	29135,	29137,	29139,	29146,	29147,	29172,	29174,	29176,	29184,	29191,	29192,	29194,	29200,	29201,	29203,	29221,	29224,	29225,	29226,	29232,	29234,	29258,	29265,	29268,	29273,	29274,	29275,	29276,	29277,	29278,	29280,	29281,	29282,	29300,	29301,	29302,	29310,	29313,	29314,	29315,	29316,	29320,	29382,	29435,	29436,	29454,	29457,	29458,	29468,	29469,	29470,	29473,	29475,	29476,	29477,	29481,	29482,	29483,	29485,	29500,	29501,	29503,	29504,	29505,	29508,	29513,	29515,	29524,	29532,	29533,	29534,	29535,	29537,	29549,	29553,	29556,	29561,	29574,	29618,	29634,	29635,	29637,	29639,	29665,	29666,	29668,	29669,	29672,	29682,	29693,	29709,	29710,	29711,	29717,	29741,	29742,	29746,	29747,	29752,	29753,	29755,	29756,	29759,	29804,	29805,	29832,	29998,	30003,	30005,	30006,	30007,	30009,	30019,	30025,	30040,	30074,	30075,	30077,	30078,	30080,	30082,	30083,	30084,	30290,	30291,	30293,	30349,	30350,	30351,	30352,	30353,	30354,	30358,	30376,	30392,	30424,	30426,	30589,	30590,	30591,	30613,	30614,	30615,	30616,	30617,	30619,	30627,	30628,	30647,	30954,	30958,	30985,	30986,	31316,	31317,	31331,	31334,	31336,	31357,	31358,	31359,	31360,	31497,	31501,	31502,	31503,	31504,	31526,	31527,	31528,	31882,	31883,	31884,	31890,	31891,	31892,	31893,	31894,	
31929,	31966,	31970,	32153,	32498,	32520,	32583,	32618,	32683,	32769,	32780,	32788,	32847,	32848,	32857,	32872,	33058,	33148,	33153,	33255,	33275,	33279,	33300,	33513,	33519,	33520,	33521,	33522,	33524,	33525,	33527,	33528,	33534,	33578,	33579,	33580,	33581,	33582,	33584,	33585,	33586,	33587,	33589,	33591,	33593,	33594,	33599,	33600,	33602,	33619,	33634,	33655,	33753,	33845,	33846,	33866,	33868,	33869,	33871,	33873,	33883,	33888,	33890,	33891,	33907,	33926,	33931,	33933,	33934,	33936,	33972,	33973,	33978,	33987,	33988,	33989,	33990,	33991,	33992,	33993,	33997,	33998,	34000,	34001,	34007,	34015,	34050,	34058,	34081,	34082,	34085,	34086,	34089,	34091,	34092,	34095,	34260,	34265,	34293,	34294,	34295,	34296,	34297,	34309,	34315,	34316,	34320,	34346,	34399,	34419,	34461,	34462,	34463,	34464,	34465,	34469,	34503,	34527,	34590,	34816,	34827,	34845,	34846,	34849,	34852,	34853,	34863,	34941,	34971,	35015,	35020,	35134,	35136,	35144,	35156,	35206,	35221,	35264,	35285,	35292,	35294,	35295,	35296,	35299,	35300,	35301,	35309,	35311,	35315,	35321,	35323,	35324,	35328,	35329,	35330,	35331,	35332,	35342,	35343,	35347,	35351,	35356,	35357,	35386,	35415,	35428,	35440,	35459,	35467,	35471,	35474,	35529,	35562,	35575,	35634,	35637,	35646,	35655,	35663,	35691,	35704,	35732,	35733,	35744,	35835,	35853,	35881,	35884,	35887,	35889,	35893,	35894,	35896,	35897,	35898,	35899,	35900,	35901,	35902,	35907,	35909,	35910,	35917,	35918,	35920,	35921,	35923,	35926,	35928,	35929,	35930,	35939,	35941,	35943,	35944,	35948,	35949,	35950,	35951,	35953,	35954,	35957,	35979,	35997,	35998,	36000,	36018,	36021,	36023,	36089,	36093,	36098,	36099,	36102,	36105,	36111,	36136,	36154,	36172,	36173,	36175,	36193,	36200,	36210,	36223,	36225,	36226,	36229,	36230,	36233,	36239,	36240,	36241,	36242,	36244,	36246,	36247,	36248,	36249,	36258,	36264,	36267,	36269,	36370,	36433,	36437,	36469,	36479,	36480,	36481,	36504,	36515,	36520,	36521,	36529,	36530,	36550,	36584,	36599,	36600,	36608,	36614,	36666,	36674,	36685,	
36707,	36717,	36736,	36743,	36756,	36760,	36775,	36784,	36785,	36787,	36804,	36830,	36843,	36844,	36850,	36854,	36860,	36870,	36874,	36875,	36876,	36877,	36879,	36952,	36958,	36979,	36980,	36991,	36996,	37050,	37051,	37058,	37092,	37093,	37111,	37117,	37120,	37123,	37137,	37142,	37147,	37148,	37149,	37150,	37151,	37152,	37170,	37176,	37187,	37190,	37192,	37193,	37198,	37201,	37205,	37209,	37217,	37221,	37226,	37227,	37231,	37242,	37244,	37255,	37266,	37319,	37324,	37352,	37365,	37375,	37415,	37429,	37448,	37450,	37452,	37495,	37518,	37519,	37569,	37570,	37572,	37573,	37576,	37597,	37608,	37627,	37676,	37677,	37735,	37743,	37748,	37749,	37750,	37751,	37756,	37758,	37766,	37767,	37792,	37801,	37805,	37807,	37808,	37812,	37828,	37834,	37835,	37838,	37840,	37841,	37842,	37843,	37844,	37845,	37846,	37849,	37850,	37852,	37854,	37863,	37866,	37873,	37877,	37880,	37881,	37883,	37897,	37900,	37908,	37927,	37996,	38008,	38081,	38085,	38091,	38092,	38161,	38183,	38187,	38195,	38200,	38282,	38292,	38300,	38302,	38303,	38309,	38314,	38316,	38317,	38321,	38360,	38368,	38374,	38382,	38398,	38399,	38402,	38403,	38410,	38411,	38420,	38429,	38431,	38439,	38452,	38464,	38467,	38483,	38499,	38500,	38514,	38515,	38530,	38533,	38547,	38548,	38556,	38558,	38559,	38560,	38561,	38563,	38564,	38565,	38566,	38567,	38568,	38569,	38571,	38574,	38575,	38578,	38619,	38635);'
    print(sql)
    papers = pd.read_sql(sql, con=db)

    for index, row in papers.iterrows():

        lang = None
        id = row[0]
        english = 0
        other = 0
        text = ""
        res = ""
        print(id)
        if id:

            # with open(os.path.join('data/txt', str(id) + '.txt')) as infile:
            with open(
                    os.path.join(
                        '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/txt',
                        str(id) + '.txt')) as infile:
                for line in infile:
                    if not re.match(r'^\s*$', line):
                        line = re.sub(r"-\n", "", line)
                        line = re.sub(r"\n", " ", line)
                        text += line
                infile.close()
            lenText = len(text)

            nrequest = round(float(lenText) / 5000)
            count = 1
            while count <= nrequest:
                res = ''
                content = ""

                posIni = (count * 5000) - 5000
                posFin = (count * 5000) - 1

                content += text[posIni:posFin]
                try:
                    translator = Translator(random.choice(key_choices))
                    res = translator.detect_lang([content])

                except:
                    pass
                if res:
                    if res == 'en':
                        english += 1
                    else:
                        other += 1
                count += 1
            if english > other:
                lang = "English"
                sql = "update resolved_papers set english = 1 where id = %s" % (
                    id)
            else:
                lang = "Other"
            try:
                cur.execute(sql)
                db.commit()
            except:
                db.rollback()
        print("Id: %s. Language: %s" % (id, lang))
    print("Done!")
Ejemplo n.º 7
0
def _downloadIEEE():
    """Download the PDF of every pending IEEE paper with ``wget``.

    Selects the rows of ``resolved_papers`` whose ``source`` contains
    ``ieee``, whose ``downloaded`` flag is 0 and whose title language is
    ``en``. For each row the IEEE article number (the first run of digits in
    ``main_link``) is used to build the Xplore ``getPDF.jsp`` URL, the file
    is saved as ``<id>.pdf`` in the ``tocheck`` directory, and ``downloaded``
    is set to 1 only when ``wget`` exits with status 0.

    Sleeps a random 1-6 seconds after every download attempt.
    """
    # Imported locally so the block does not rely on a file-level import.
    import subprocess

    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'

    try:
        sql = "SELECT p.id, p.main_link, p.direct_link FROM `resolved_papers` p inner join `resolved_papers_title` pt on pt.Id = p.Id where p.source like '%ieee%' and p.downloaded = 0 and pt.`title_language` = 'en';"
        papers = pd.read_sql(sql, con=db)

        for index, row in papers.iterrows():
            row_id = row['id']
            main_link = row['main_link']

            path = destination + str(row_id) + '.pdf'
            print(path)

            # The article number is the first run of digits in the link; a
            # link without digits is skipped instead of aborting the loop.
            numbers = re.findall(r'\d+', str(main_link))
            if not numbers:
                print('No IEEE article number found in main_link for identifier %s'
                      % (row_id))
                continue

            url_pdf = 'https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&isnumber=&arnumber=%s' % (
                numbers[0])

            # An argument list (no shell) keeps the URL and the path from
            # being interpreted by a shell.
            try:
                exit_code = subprocess.run(['wget', url_pdf, '-O',
                                            path]).returncode
            except OSError as exc:
                # wget is missing or cannot be executed.
                print('Could not run wget for identifier %s: %s' %
                      (row_id, exc))
                exit_code = None

            if exit_code == 0:
                sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                    row_id)

                try:
                    cur.execute(sql)
                    db.commit()
                    print("Id: %s. Downloaded: True. Saved!" % (row_id))
                except pymysql.MySQLError as exc:
                    db.rollback()
                    print("Id: %s. Could not update the database: %s" %
                          (row_id, exc))
            elif exit_code is not None:
                # wget -O may leave an empty or partial file behind on failure,
                # so the paper must not be flagged as downloaded.
                print('Failed to fetch IEEE PDF with identifier %s (wget exit code %s).'
                      % (row_id, exit_code))

            time.sleep(randint(1, 6))
    finally:
        cur.close()
        db.close()
Ejemplo n.º 8
0
def _downloadSpringer(ids):
    """Download one Springer paper as PDF and flag it as downloaded.

    The PDF URL is derived from ``main_link`` by replacing ``article`` or
    ``chapter`` with ``content/pdf`` and appending ``.pdf``. When the server
    answers 200 with an ``application/pdf`` content type, the body is saved
    as ``<id>.pdf`` in the ``tocheck`` directory and
    ``resolved_papers.downloaded`` is set to 1.

    Always sleeps a random 1-6 seconds before returning.

    Args:
        ids: ``(id, main_link, direct_link)`` tuple; ``direct_link`` is
            unpacked but not used.
    """
    # Unpack before connecting so a malformed argument fails fast and the
    # error handler below can always name the paper.
    paper_id, main_link, _direct_link = ids

    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        if 'article' in main_link:
            # https://link.springer.com/article/10.1007/s10579-014-9282-3
            # https://link.springer.com/content/pdf/10.1007%2Fs10579-014-9282-3.pdf
            url_pdf = main_link.replace('article', 'content/pdf') + '.pdf'
        elif 'chapter' in main_link:
            # http://link.springer.com/chapter/10.1007/978-3-319-09846-3_4/fulltext.html
            # https://link.springer.com/content/pdf/10.1007%2F978-3-319-09846-3.pdf
            url_pdf = main_link.replace('chapter', 'content/pdf') + '.pdf'
        else:
            # Neither pattern matched: there is no URL to request.
            print('Unsupported Springer link for identifier %s: %s' %
                  (paper_id, main_link))
            return

        ua = str(get_random_ua())

        try:
            response = requests.get(url_pdf, headers={'User-Agent': ua})
        except requests.exceptions.RequestException:
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        if response.status_code != 200:
            return

        content_type = response.headers.get('content-type')
        if 'application/pdf' not in str(content_type):
            print('Title with identifier %s not found' % (paper_id))
            return

        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        path = destination + str(paper_id) + '.pdf'

        with open(path, 'wb') as f:
            f.write(response.content)

        sql = "update resolved_papers set downloaded = 1 where id = %s" % (
            paper_id)

        try:
            cur.execute(sql)
            db.commit()
            print("Id: %s. Downloaded: True. Saved!" % (paper_id))
        except pymysql.MySQLError as exc:
            db.rollback()
            print("Id: %s. Could not update the database: %s" %
                  (paper_id, exc))
    except Exception as exc:
        # Best effort: one failing paper must not stop the caller's batch,
        # but the real cause is reported instead of being hidden.
        print('Failed to download Springer paper with identifier %s: %s' %
              (paper_id, exc))
    finally:
        cur.close()
        db.close()
        # Throttle before the caller moves on to the next paper.
        time.sleep(randint(1, 6))
Ejemplo n.º 9
0
def _download(ids):
    """Search a paper by title and download its PDF when the title matches.

    Queries ``SCHOLARS_BASE_URL + "/search"`` with the lower-cased title,
    takes the first ``div.result`` of the answer and compares its title with
    the query (case-insensitively). On a match the result's summary link is
    rewritten into a download link; if that link returns
    ``application/pdf`` the file is saved as ``<id>.pdf`` in the ``tocheck``
    directory and ``resolved_papers.downloaded`` is set to 1.

    Always sleeps a random 1-6 seconds before returning.

    Args:
        ids: ``(id, query)`` tuple where ``query`` is the paper title.
    """
    # Unpack before connecting so a malformed argument fails fast and the
    # error handler below can always name the paper.
    paper_id, query = ids

    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        params = urlencode({'q': query.lower()}, "UTF-8")

        url = SCHOLARS_BASE_URL + "/search?" + params

        print(url)

        ua = str(get_random_ua())

        try:
            response = requests.get(url, headers={'User-Agent': ua})
        except requests.exceptions.RequestException:
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.text, "html.parser")

        results = soup.find_all('div', {'class': 'result'})
        if not results:
            print('Title is not found with identifier %s' % (paper_id))
            return

        # The second child of the result block, split by line: line 1 holds
        # the anchor with the summary link, line 2 the title markup.
        link = str(results[0].contents[1]).split('\n')
        title = re.sub('<[^<]+?>', '', link[2])

        if query.lower() != title.lower():
            print('Title is not found with identifier %s' % (paper_id))
            return

        anchor = BeautifulSoup(link[1], "html.parser").find(
            "a", class_="remove doc_details")
        if anchor is None:
            print('Title is not found with identifier %s' % (paper_id))
            return

        # e.g. '/viewdoc/summary;jsessionid=...?doi=10.1.1.317.9673&rank=1'
        # becomes '/viewdoc/download?doi=10.1.1.317.9673&rep=rep1&type=pdf'.
        suffix = re.sub(r';.*\?', '?', anchor.attrs['href'])
        suffix = suffix.replace('summary', 'download').replace(
            '&rank=1', '&rep=rep1&type=pdf')

        url_pdf = SCHOLARS_BASE_URL + suffix
        print(url_pdf)

        res = requests.get(url_pdf)
        content_type = res.headers.get('content-type')

        if 'application/pdf' not in str(content_type):
            # Nothing was saved, so the paper must not be flagged.
            print('PDF not available for identifier %s' % (paper_id))
            return

        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        path = destination + str(paper_id) + '.pdf'

        with open(path, 'wb') as f:
            f.write(res.content)

        sql = "update resolved_papers set downloaded = 1 where id = %s" % (
            paper_id)

        try:
            cur.execute(sql)
            db.commit()
            print("Id: %s. Downloaded: True. Saved!" % (paper_id))
        except pymysql.MySQLError as exc:
            db.rollback()
            print("Id: %s. Could not update the database: %s" %
                  (paper_id, exc))
    except Exception as exc:
        # Best effort: one failing paper must not stop the caller's batch,
        # but the real cause is reported instead of being hidden.
        print(
            'Failed to fetch citeseerx page with identifier %s due to request exception: %s'
            % (paper_id, exc))
    finally:
        cur.close()
        db.close()
        # Throttle before the caller moves on to the next paper.
        time.sleep(randint(1, 6))
Ejemplo n.º 10
0
import pymysql

from base import db, cur

from os import listdir
from os.path import isfile, join

# Flag as downloaded every paper that has a manually fetched file in
# data/manual. The part of the file name before the first dot is the paper
# id, e.g. "1702.pdf" -> 1702.
manual_dir = 'data/manual'
files = [f for f in listdir(manual_dir) if isfile(join(manual_dir, f))]

for file in files:
    name = file.split('.')[0]

    # Hidden or stray files (".DS_Store", "notes.txt", ...) have no numeric
    # id; skip them instead of sending malformed SQL to the server.
    if not name.isdecimal():
        print("Skipping %s: file name does not start with a numeric id" %
              (file))
        continue

    sql = "update resolved_papers set downloaded = 1 where id = %s"

    try:
        # The id is passed as a query parameter rather than formatted into
        # the statement.
        cur.execute(sql, (int(name), ))
        db.commit()
        print("Id: %s. Updated!" % (name))
    except pymysql.MySQLError as exc:
        db.rollback()
        print("Id: %s. Could not update the database: %s" % (name, exc))
Ejemplo n.º 11
0
def downloadPDFIEEE(ids):
    """Download one IEEE paper as PDF and flag it as downloaded.

    Only papers whose title language is ``"en"`` are processed. The first
    attempt hands ``direct_link`` to ``DownloadPDF``. If that does not
    succeed, a second attempt fetches the IEEE ``stamp.jsp`` URL, built from
    the first run of digits in ``main_link``, with a plain HTTP GET and
    stores the body as ``<i>.pdf`` in the ``tocheck`` directory. On success
    ``resolved_papers.downloaded`` is set to 1 for the paper.

    Args:
        ids: ``(i, res_title, main_link, direct_link)`` tuple where ``i`` is
            the paper id and ``res_title`` the title language code.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'

    try:
        i, res_title, main_link, direct_link = ids

        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)

        downloaded = "False"
        count = 0
        url = direct_link

        while downloaded == "False" and count < 2:
            count += 1
            p = False

            if count == 2:
                # Fallback: plain GET of the stamp URL computed after the
                # first attempt failed.
                try:
                    file = requests.get(url)
                except requests.exceptions.RequestException as exc:
                    print("Id: %s. Request failed: %s" % (i, exc))
                else:
                    if file.status_code == 200:
                        # Format the file name *before* opening it, so each
                        # paper gets its own "<i>.pdf" file.
                        with open(destination + '%s.pdf' % (i), 'wb') as out:
                            out.write(file.content)
                        p = True
            elif url:
                s = DownloadPDF()
                p = s.download(url,
                               destination=destination,
                               path=str(i) + '.pdf')

            # The return type of DownloadPDF.download is not visible here, so
            # the original strict comparison is kept.
            if p == True:
                downloaded = "True"
            else:
                # Build the IEEE stamp URL from the article number embedded
                # in main_link; without a number there is nothing to retry.
                numbers = re.findall(r'\d+', str(main_link))
                if not numbers:
                    break
                # url = 'http://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&isnumber=&arnumber=%s' % (paper_id)
                url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % (numbers[0])

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as exc:
                db.rollback()
                print("Id: %s. Could not update the database: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass
    finally:
        # Release the per-call connection on every exit path.
        cur.close()
        db.close()
Ejemplo n.º 12
0
def downloadPDF(ids):
    """Download one paper as PDF and flag it as downloaded.

    Only papers whose title language is ``"en"`` are processed. Up to two
    attempts are made with ``DownloadPDF``: first with ``direct_link``, then
    with ``main_link``. The file is stored as ``<i>.pdf`` in the ``tocheck``
    directory and, on success, ``resolved_papers.downloaded`` is set to 1.

    Args:
        ids: ``(i, res_title, main_link, direct_link)`` tuple where ``i`` is
            the paper id and ``res_title`` the title language code.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        i, res_title, main_link, direct_link = ids

        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)

        p = False
        downloaded = "False"
        count = 0
        url = direct_link

        while downloaded == "False" and count < 2:
            count += 1
            if url:
                s = DownloadPDF()
                p = s.download(url,
                               destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                               path=str(i) + '.pdf')
            # The return type of DownloadPDF.download is not visible here, so
            # the original strict comparison is kept.
            if p == True:
                downloaded = "True"
            else:
                # Second attempt uses the landing page link.
                url = main_link

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as exc:
                db.rollback()
                print("Id: %s. Could not update the database: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass
    finally:
        # Release the per-call connection on every exit path.
        cur.close()
        db.close()
Ejemplo n.º 13
0
def _filterTitle(papers):
    """Select a paper when its title contains any keyword of interest.

    The lower-cased title is searched (substring match) for three keyword
    groups: cross/multi-language terms, copy/plagiarism terms and detection
    terms. When at least one group reaches the threshold of one hit, the
    paper id is inserted into ``resolved_papers_selected_title``.

    Args:
        papers: ``(id, title)`` tuple.

    Returns:
        ``True`` when the paper was selected and stored, ``False`` when no
        keyword group matched, and ``None`` when an error occurred (the
        transaction is rolled back and ``'no saved'`` is printed).
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        id, title = papers

        threshold = 1

        title = title.lower()

        diff_language = ["Cross-language",
                         "Crosslanguage",
                         "Cross-lingual",
                         "Crosslingual",
                         "Cross-linguistic",
                         "Crosslinguistic",
                         "Multi-language",
                         "Multilanguage",
                         "Multi-lingual",
                         "Multilingual",
                         "Multi-linguistic",
                         "Multilinguistic",
                         "Machine-translation", ]

        copy = ["Copy",
                "Duplicate",
                "Plagiarism", ]

        detection = ["Detection",
                     "Discovery", ]

        # Number of keywords of each group that occur in the title.
        k_dflanguage = sum(1 for row in diff_language if row.lower() in title)
        k_copy = sum(1 for row in copy if row.lower() in title)
        k_detection = sum(1 for row in detection if row.lower() in title)

        print("diff_language: %s." % (k_dflanguage))
        print("copy: %s." % (k_copy))
        print("detection: %s." % (k_detection))

        # Every group counts; the copy group used to be left out because the
        # detection counter was tested twice.
        if (k_dflanguage >= threshold or k_copy >= threshold or k_detection >= threshold):
            sql = "insert into resolved_papers_selected_title values (%s)" % (id)
            print(sql)
            cur.execute(sql)
            db.commit()

            return True
        else:
            return False
    except Exception:
        db.rollback()
        print('no saved')
    finally:
        # Release the per-call connection on every exit path.
        cur.close()
        db.close()
def downloadPDF(ids):
    """Download one paper as PDF with ``urlretrieve`` and flag it.

    Up to two attempts are made: first with ``direct_link``, then with
    ``main_link``. Each attempt prints the URL the link resolves to, saves
    the response as ``<i>.pdf`` in the ``tocheck`` directory and counts as a
    success only when the response content type is ``application/pdf``. On
    success ``resolved_papers.downloaded`` is set to 1 for the paper.

    Args:
        ids: ``(i, main_link, direct_link)`` tuple where ``i`` is the paper
            id.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        i, main_link, direct_link = ids

        downloaded = "False"
        count = 0

        print(i)
        print(main_link)
        print(direct_link)

        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        # The target file is named after the paper that was passed in; the
        # debugging overrides of the url and the id have been removed.
        path = destination + str(i) + '.pdf'

        url = direct_link

        while downloaded == "False" and count < 2:
            count += 1
            if url:
                try:
                    ua = UserAgent()
                    headers = {'User-Agent': str(ua.random)}

                    # Diagnostic output: where the link finally resolves to.
                    r = requests.head(url, allow_redirects=True)
                    print(r.url)

                    s = requests.session()

                    res = s.get(url, headers=headers, allow_redirects=False)
                    print(res.url)

                    p = urlretrieve(url, path)

                    if p[1].get_content_type() == 'application/pdf':
                        downloaded = "True"
                except Exception as exc:
                    # Best effort: report the failure and try the next link.
                    print("Id: %s. Attempt %s failed: %s" % (i, count, exc))

            if downloaded == "False":
                # Next attempt (if any) uses the landing page link.
                url = main_link

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as exc:
                db.rollback()
                print("Id: %s. Could not update the database: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass
    finally:
        # Release the per-call connection on every exit path.
        cur.close()
        db.close()
def _countOccurencies(id, type):
    """Store how often each keyword occurs in a paper's head and tail text.

    For the ``<id>_head.txt`` file and then the ``<id>_tail_noreferences.txt``
    file, every line is passed through ``_processText``, the joined text
    through ``_processNL``, and the resulting tokens are counted with
    ``nltk.FreqDist``. Each keyword with a positive count is inserted into
    ``resolved_papers_occurrenciesv4`` together with ``id``, ``type`` and the
    section name; a failing insert is rolled back and skipped.

    Args:
        id: paper id, used to build the file names and stored in each row.
        type: value stored in the second column of each inserted row.

    Returns:
        ``('Done', head, tail)`` where ``head`` / ``tail`` tell whether any
        keyword was found in the respective section.
    """
    keywords = [
        "Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
        "Cross-linguistic", "Crosslinguistic", "Multi-language",
        "Multilanguage", "Multi-lingual", "Multilingual", "Multi-linguistic",
        "Multilinguistic", "Machine-translation", "Copy", "Duplicate",
        "Plagiarism", "Detection", "Discovery"
    ]
    txt_dir = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/txt'

    # Section label stored in the table -> suffix of the text file to read.
    # The head is processed completely before the tail file is opened.
    sections = (("head", '_head.txt'), ("tail", '_tail_noreferences.txt'))
    found = {}

    for section, suffix in sections:
        text = ""
        with open(os.path.join(txt_dir, str(id) + suffix)) as infile:
            for raw_line in infile:
                text += _processText(raw_line)

        fdist = nltk.FreqDist(_processNL(text))

        matched = False
        for keyword in keywords:
            term = str(keyword).lower()
            occurrences = fdist[term]
            if occurrences > 0:
                sql = "insert into resolved_papers_occurrenciesv4 values (%s, '%s', '%s', '%s', %s);" % (
                    id, type, section, term, occurrences)
                matched = True
                try:
                    cur.execute(sql)
                    db.commit()
                except:
                    db.rollback()
        found[section] = matched

    return ('Done', found["head"], found["tail"])