def splitFiles(type):
    """Run head/tail extraction on the selected papers and flag the successes.

    Reads ``id`` and ``npages`` from ``resolved_papers`` for rows that are
    downloaded, have ``npages >= 5``, ``pdf2text = 1`` and ``english = 1``,
    calls ``_getHead`` or ``_getTail`` for each row, and sets the column named
    by ``type`` to 1 when the helper returns a truthy value.

    NOTE(review): both queries also require the ``type`` column to already be
    1 and restrict the run to ``id in (1702)``; this looks like a debugging
    leftover -- confirm before running on the full table.

    Uses the module-level ``db`` connection and ``cur`` cursor. The shared
    cursor is closed at the end, so it cannot be reused afterwards.

    Args:
        type: ``'head'`` or ``'tail'``. The value is also used as a column
            name, so any other value is rejected. (The name shadows the
            builtin but is kept for backward compatibility with callers.)

    Raises:
        ValueError: If ``type`` is neither ``'head'`` nor ``'tail'``.
    """
    # The value ends up inside SQL as an identifier, which cannot be bound as
    # a query parameter, so it is whitelisted up front instead.
    if type not in ('head', 'tail'):
        raise ValueError("type must be 'head' or 'tail', got %r" % (type,))

    if type == 'head':
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and %s = 1 and id in (1702);' % (
            type)
    else:
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and %s = 1 and id in (1702);' % (
            type)
    print(sql)
    papers = pd.read_sql(sql, con=db)

    # Safe to concatenate: ``type`` was validated against the whitelist above.
    update_sql = "update resolved_papers set " + type + " = 1 where id = %s"

    for _, row in papers.iterrows():
        # Access by label: positional ``row[0]`` on a label-indexed Series is
        # deprecated in pandas. ``int()`` turns the numpy scalar into a plain
        # int that the DB driver can bind.
        paper_id = int(row['id'])
        if type == 'head':
            res = _getHead(row['id'], row['npages'])
        else:
            res = _getTail(row['id'], row['npages'])
        if not res:
            continue
        try:
            cur.execute(update_sql, (paper_id,))
            db.commit()
        except Exception as exc:
            # Best effort: undo this row, report it, and keep going.
            db.rollback()
            print("Id: %s. %s update failed: %s" % (paper_id, type.title(), exc))
        else:
            print("Id: %s. %s: %s" % (paper_id, type.title(), res))
    cur.close()
def _countOccurencies(papers):
    """Count keyword occurrences in one paper title and store the non-zero ones.

    The title is passed through ``_processText`` and ``_processNL``, the
    resulting tokens are counted with ``nltk.FreqDist``, and one row
    ``(id, keyword, count)`` is inserted into
    ``resolved_papers_title_occurrencies`` for every keyword whose count is
    greater than zero.

    Args:
        papers: A two-item sequence ``(id, title)``.

    Returns:
        None. Failures are reported on stdout and do not raise, except for a
        failure to open the connection, which propagates.
    """
    # Keep this per-call connection in order to use multiprocessing.
    # NOTE(security): host and credentials are hard-coded in source; they
    # should be moved to configuration / environment variables.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base
    # Already lower-cased and stripped, so no per-call normalisation is needed.
    keywords = (
        "cross-language", "crosslanguage",
        "cross-lingual", "crosslingual",
        "cross-linguistic", "crosslinguistic",
        "multi-language", "multilanguage",
        "multi-lingual", "multilingual",
        "multi-linguistic", "multilinguistic",
        "machine-translation",
        "copy", "duplicate", "plagiarism", "detection", "discovery",
    )
    insert_sql = "insert into resolved_papers_title_occurrencies values (%s, %s, %s);"
    try:
        cur = db.cursor()
        try:
            paper_id, title = papers
            # Plain int so the driver can bind it even if a numpy scalar
            # was passed in.
            paper_id = int(paper_id)
            text = _processText(title)
            words = _processNL(text)
            fdist = nltk.FreqDist(words)
            for keyword in keywords:
                count = fdist[keyword]
                if count <= 0:
                    continue
                try:
                    cur.execute(insert_sql, (paper_id, keyword, int(count)))
                    db.commit()
                    print('saved')
                except Exception as exc:
                    # One failed keyword must not stop the remaining ones.
                    db.rollback()
                    print('no saved (%s): %s' % (keyword, exc))
        except Exception as exc:
            db.rollback()
            print('no saved: %s' % (exc,))
        finally:
            cur.close()
    finally:
        # The original only closed the cursor, leaking one connection per call.
        db.close()
def classifyPub():
    """Classify every eligible paper that has no ``type`` yet and store the result.

    Selects ``id`` and ``npages`` from ``resolved_papers`` for rows that are
    downloaded, have ``npages >= 5``, ``pdf2text = 1``, ``english = 1``,
    ``head = 1``, ``tail = 1`` and a NULL ``type``. Each row is passed to
    ``_classifyPub``; a truthy result is written to the ``type`` column.

    Uses the module-level ``db`` connection and ``cur`` cursor. The shared
    cursor is closed at the end, so it cannot be reused afterwards.
    """
    sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and type is NULL;'
    papers = pd.read_sql(sql, con=db)

    # Values are bound by the driver instead of being formatted into the
    # statement, so a quote character in the label cannot break the SQL.
    update_sql = "update resolved_papers set type = %s where id = %s"

    for _, row in papers.iterrows():
        # Access by label: positional ``row[0]`` / ``row[1]`` on a
        # label-indexed Series is deprecated in pandas.
        res = _classifyPub(row['id'], row['npages'])
        if not res:
            continue
        # ``int()`` / ``str()`` give the driver plain Python values to bind.
        paper_id = int(row['id'])
        params = (str(res), paper_id)
        print(update_sql, params)
        try:
            cur.execute(update_sql, params)
            db.commit()
        except Exception as exc:
            # Best effort: undo this row, report it, and keep going.
            db.rollback()
            print("Id: %s. type update failed: %s" % (paper_id, exc))
        else:
            print("Id: %s. type: %s" % (paper_id, res))
    cur.close()
def updateNumPages():
    """Fill in ``npages`` for downloaded papers that do not have it yet.

    Selects the ``id`` of every row in ``resolved_papers`` with
    ``downloaded = 1`` and a NULL ``npages``, asks ``_getNPages`` for the page
    count, and stores it when the helper returns a truthy value.

    Uses the module-level ``db`` connection and ``cur`` cursor. The shared
    cursor is closed at the end, so it cannot be reused afterwards.
    """
    sql = 'select id from resolved_papers where downloaded = 1 and npages is NULL;'
    papers = pd.read_sql(sql, con=db)

    update_sql = "update resolved_papers set npages = %s where id = %s"

    for _, row in papers.iterrows():
        pages = _getNPages(row['id'])
        if not pages:
            continue
        # Access by label: positional ``row[0]`` on a label-indexed Series is
        # deprecated in pandas. ``int()`` turns the numpy scalar into a plain
        # int that the DB driver can bind.
        paper_id = int(row['id'])
        try:
            # Converted inside the try so a non-numeric page count is
            # reported like any other failed update instead of crashing.
            cur.execute(update_sql, (int(pages), paper_id))
            db.commit()
        except Exception as exc:
            # Best effort: undo this row, report it, and keep going.
            db.rollback()
            print("Id: %s. Num Pages update failed: %s" % (paper_id, exc))
        else:
            print("Id: %s. Num Pages: %s" % (paper_id, pages))
    cur.close()
def _titlesLang(ids):
    """Detect the language of one paper title and store it in the DB.

    Passes the title to ``_checkTitle`` and inserts ``(id, result)`` into
    ``resolved_papers_title``.

    Args:
        ids: A two-item sequence ``(id, title)``.

    Returns:
        None. Failures are reported on stdout and do not raise, except for a
        failure to open the connection, which propagates.
    """
    # Keep this per-call connection in order to use multiprocessing.
    # NOTE(security): host and credentials are hard-coded in source; they
    # should be moved to configuration / environment variables.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base
    insert_sql = "insert into resolved_papers_title values (%s, %s);"
    try:
        cur = db.cursor()
        try:
            paper_id, title = ids
            lang = _checkTitle(title)
            # ``str(lang)`` matches what the old ``'%s'`` formatting stored;
            # ``int()`` gives the driver a plain int to bind.
            params = (int(paper_id), str(lang))
            print(insert_sql, params)
            cur.execute(insert_sql, params)
            db.commit()
            print("saved")
        except Exception as exc:
            db.rollback()
            print('no saved: %s' % (exc,))
        finally:
            cur.close()
    finally:
        # The original only closed the cursor, leaking one connection per call.
        db.close()
def languageDetection(): # sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 0 and id in (12, 70, 74, 77, 92, 108, 110, 111, 113, 127, 128, 129, 133, 136, 145, 149, 151, 189, 210, 223, 238, 247, 253, 276, 287, 289, 291, 292, 303, 308, 345, 346, 347, 349, 350, 351, 354, 355, 359, 360, 361, 362, 363, 364, 365, 368, 377, 381, 389, 393, 395, 406, 414, 424, 439, 446, 448, 549, 554, 558, 574, 577, 578, 579, 581, 582, 583, 585, 588, 589, 591, 592, 595, 597, 601, 604, 605, 609, 613, 621, 625, 682, 684, 712, 713, 714, 715, 716, 717, 719, 722, 723, 724, 726, 730, 731, 732, 734, 735, 738, 739, 740, 743, 749, 751, 752, 753, 754, 755, 758, 765, 782, 787, 816, 822, 830, 836, 851, 857, 860, 861, 869, 882, 970, 1044, 1045, 1047, 1050, 1052, 1055, 1056, 1057, 1058, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1068, 1069, 1072, 1073, 1074, 1075, 1076, 1079, 1080, 1083, 1084, 1086, 1087, 1089, 1094, 1100, 1104, 1105, 1106, 1115, 1116, 1117, 1122, 1124, 1125, 1126, 1131, 1133, 1142, 1143, 1146, 1150, 1151, 1172, 1174, 1176, 1184, 1194, 1248, 1283, 1301, 1307, 1309, 1367, 1381, 1417, 1419, 1452, 1456, 1482, 1491, 1507, 1511, 1513, 1522, 1542, 1562, 1585, 1587, 1591, 1624, 1626, 1628, 1652, 1687, 1688, 1689, 1692, 1693, 1694, 1696, 1698, 1699, 1701, 1704, 1710, 1711, 1714, 1716, 1719, 1720, 1727, 1728, 1730, 1745, 1750, 1751, 1755, 1757, 1770, 1809, 1815, 1820, 1831, 1835, 1872, 1884, 1887, 1898, 1935, 1955, 1993, 2009, 2025, 2026, 2029, 2030, 2031, 2199, 2241, 2244, 2246, 2275, 2276, 2277, 2278, 2279, 2305, 2323, 2324, 2325, 2327, 2328, 2347, 2360, 2402, 2404, 2410, 2415, 2442, 2448, 2450, 2451, 2452, 2461, 2462, 2467, 2477, 2509, 2510, 2512, 2513, 2518, 2522, 2524, 2531, 2543, 2547, 2554, 2555, 2576, 2577, 2578, 2579, 2580, 2583, 2586, 2605, 2609, 2624, 2629, 2646, 2651, 2652, 2653, 2655, 2656, 2659, 2661, 2662, 2671, 2676, 2677, 2756, 2757, 2758, 2760, 2761, 2762, 2768, 2771, 2772, 2773, 2774, 2776, 2777, 2781, 2782, 2783, 2786, 
2789, 2790, 2791, 2792, 2793, 2794, 2795, 2798, 2811, 2815, 2822, 2869, 2884, 2907, 2913, 2920, 2924, 3029, 3127, 3141, 3146, 3172, 3173, 3174, 3175, 3176, 3177, 3178, 3180, 3182, 3183, 3184, 3185, 3189, 3192, 3194, 3198, 3199, 3202, 3203, 3207, 3208, 3211, 3223, 3224, 3230, 3236, 3252, 3253, 3262, 3275, 3302, 3305, 3316, 3365, 3388, 3389, 3391, 3392, 3396, 3397, 3398, 3399, 3400, 3401, 3402, 3405, 3406, 3408, 3409, 3412, 3415, 3416, 3418, 3419, 3420, 3421, 3422, 3423, 3424, 3425, 3426, 3427, 3428, 3431, 3432, 3433, 3436, 3438, 3439, 3443, 3444, 3445, 3446, 3450, 3452, 3455, 3456, 3458, 3461, 3466, 3467, 3470, 3503, 3526, 3532, 3536, 3538, 3541, 3542, 3543, 3549, 3563, 3573, 3597, 3598, 3620, 3626, 3662, 3819, 3921, 3922, 3923, 3925, 3927, 3931, 3932, 3933, 3934, 3935, 3936, 3937, 3938, 3939, 3940, 3942, 3943, 3944, 3945, 3948, 3950, 3952, 3953, 3954, 3955, 3957, 3958, 3959, 3960, 3961, 3963, 3965, 3966, 3967, 3968, 3971, 3972, 3980, 3988, 3995, 4000, 4005, 4011, 4039, 4043, 4046, 4048, 4050, 4059, 4077, 4086, 4089, 4098, 4101, 4104, 4109, 4111, 4123, 4127, 4170, 4184, 4203, 4215, 4221, 4235, 4287, 4295, 4345, 4362, 4367, 4448, 4449, 4451, 4452, 4453, 4454, 4455, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4472, 4478, 4480, 4481, 4482, 4496, 4500, 4504, 4508, 4513, 4518, 4523, 4524, 4548, 4551, 4567, 4572, 4598, 4607, 4608, 4611, 4657, 4786, 4788, 4789, 4791, 4792, 4793, 4794, 4795, 4796, 4797, 4798, 4799, 4804, 4805, 4811, 4815, 4817, 4819, 4829, 4839, 4840, 5037, 5038, 5040, 5047, 5179, 5191, 5192, 5210, 5248, 5249, 5266, 5275, 5276, 5322, 5323, 5327, 5330, 5362, 5410, 5411, 5416, 5451, 5462, 5493, 5494, 5496, 5519, 5536, 5548, 5555, 5587, 5588, 5589, 5590, 5591, 5594, 5599, 5617, 5633, 5636, 5660, 5667, 5695, 5697, 5701, 5702, 5706, 5767, 5768, 5769, 5773, 5778, 5786, 5831, 5832, 5833, 5835, 5836, 5837, 5839, 5844, 5849, 5850, 5858, 5860, 5889, 5901, 5915, 5916, 5918, 5920, 5991, 5992, 5993, 5994, 5995, 6009, 6045, 6079, 
6080, 6081, 6083, 6084, 6085, 6086, 6087, 6100, 6101, 6107, 6185, 6249, 6278, 6279, 6280, 6281, 6282, 6283, 6285, 6305, 6306, 6387, 6393, 6396, 6397, 6398, 6411, 6439, 6498, 6505, 6511, 6513, 6518, 6520, 6524, 6525, 6526, 6527, 6532, 6543, 6553, 6555, 6565, 6566, 6569, 6573, 6574, 6581, 6585, 6601, 6605, 6606, 6612, 6615, 6617, 6621, 6645, 6646, 6648, 6651, 6652, 6658, 6660, 6667, 6672, 6676, 6682, 6684, 6688, 6690, 6692, 6693, 6700, 6704, 6743, 6769, 6771, 6772, 6775, 6778, 6783, 6785, 6789, 6793, 6818, 6824, 6829, 6830, 6834, 6839, 6845, 6846, 6849, 6850, 6855, 6859, 6866, 6873, 6878, 6887, 6888, 6889, 6890, 6907, 6926, 6945, 6948, 6954, 6963, 7006, 7066, 7082, 7102, 7121, 7162, 7163, 7271, 7272, 7273, 7285, 7314, 7315, 7350, 7362, 7364, 7398, 7441, 7442, 7443, 7444, 7446, 7451, 7454, 7456, 7462, 7464, 7504, 7515, 7516, 7547, 7548, 7634, 7659, 7660, 7661, 7662, 7663, 7664, 7665, 7672, 7776, 7777, 7783, 7784, 7788, 7789, 7792, 7795, 7797, 7798, 7799, 7809, 7831, 7889, 7917, 7918, 7920, 7926, 7930, 7932, 7933, 7935, 7936, 7941, 7944, 7960, 7962, 7971, 8008, 8017, 8070, 8075, 8076, 8110, 8111, 8112, 8117, 8120, 8128, 8129, 8130, 8133, 8136, 8140, 8143, 8144, 8145, 8148, 8149, 8150, 8153, 8154, 8159, 8163, 8203, 8225, 8268, 8270, 8302, 8310, 8312, 8419, 8421, 8496, 8497, 8498, 8500, 8505, 8506, 8507, 8508, 8510, 8513, 8517, 8533, 8543, 8584, 8710, 8717, 8718, 8719, 8720, 8721, 8722, 8724, 8726, 8730, 8732, 8733, 8734, 8737, 8739, 8740, 8741, 8742, 8743, 8744, 8745, 8747, 8748, 8750, 8751, 8752, 8753, 8754, 8755, 8756, 8757, 8759, 8761, 8764, 8766, 8768, 8769, 8773, 8774, 8775, 8784, 8811, 8817, 9042, 9056, 9207, 9219, 9240, 9249, 9273, 9318, 9322, 9422, 9457, 9485, 9562, 9623, 9647, 9836, 9837, 9922, 10067, 10068, 10069, 10168, 10185, 10288, 10400, 10401, 10513, 10515, 10606, 10700, 10702, 10703, 10771, 10772, 10819, 10821, 10927, 11019, 11056, 11113, 11142, 11143, 11225, 11226, 11227, 11343, 11361, 11362, 11364, 11377, 11448, 11460, 11461, 11462, 11463, 11465, 
11466, 11468, 11493, 11609, 11610, 11611, 11617, 11638, 11659, 11718, 11748, 11749, 11750, 11751, 11762, 11821, 11850, 11891, 11898, 11911, 11913, 11914, 11915, 11916, 11917, 11918, 11919, 11920, 11921, 11922, 11923, 11926, 11928, 11934, 11955, 11980, 12026, 12030, 12044, 12092, 12093, 12094, 12095, 12096, 12098, 12100, 12101, 12102, 12103, 12104, 12105, 12106, 12107, 12108, 12109, 12110, 12111, 12112, 12113, 12114, 12122, 12123, 12125, 12144, 12147, 12234, 12235, 12237, 12256, 12305, 12339, 12346, 12407, 12448, 12511, 12665, 12705, 12706, 12708, 12709, 12710, 12711, 12712, 12713, 12714, 12716, 12717, 12718, 12719, 12720, 12721, 12722, 12725, 12729, 12742, 12753, 12762, 12802, 12813, 12816, 12821, 12823, 12843, 12856, 12905, 12907, 13006, 13061, 13062, 13063, 13137, 13138, 13198, 13329, 13330, 13331, 13332, 13494, 13495, 13582, 13583, 13584, 13585, 13586, 13697, 13833, 13834, 13835, 13836, 13837, 13840, 14160, 14161, 14200, 14341, 14342, 14343, 14590, 14591, 14597, 14610, 14614, 14631, 14632, 14633, 14634, 14635, 14650, 14655, 14656, 14689, 14726, 14777, 14870, 14871, 14872, 14921, 14922, 14923, 14991, 14992, 14993, 14994, 14995, 15136, 15137, 15138, 15139, 15140, 15141, 15142, 15143, 15152, 15216, 15265, 15277, 15387, 15388, 15483, 15546, 15550, 15587, 15590, 15623, 15641, 15653, 15711, 15712, 15730, 15743, 15763, 15794, 15805, 15821, 15831, 15884, 15932, 16039, 16122, 16124, 16153, 16175, 16181, 16220, 16233, 16264, 16277, 16306, 16361, 16377, 16391, 16392, 16393, 16402, 16404, 16431, 16439, 16440, 16444, 16447, 16448, 16455, 16457, 16463, 16468, 16513, 16524, 16528, 16551, 16569, 16594, 16596, 16600, 16610, 16647, 16648, 16718, 16731, 16763, 16765, 16794, 16795, 16899, 16948, 16962, 16993, 16998, 17011, 17013, 17034, 17061, 17062, 17141, 17142, 17143, 17144, 17155, 17158, 17248, 17262, 17263, 17264, 17265, 17266, 17333, 17334, 17335, 17395, 17396, 17398, 17400, 17401, 17405, 17410, 17412, 17417, 17420, 17431, 17547, 17584, 17585, 17587, 17599, 17674, 17676, 
17677, 17679, 17711, 17719, 17749, 17750, 17751, 17752, 17753, 17754, 17756, 17757, 17811, 17812, 17814, 17948, 17963, 17964, 17965, 17989, 17998, 18083, 18139, 18145, 18165, 18229, 18230, 18257, 18264, 18273, 18321, 18322, 18323, 18351, 18515, 18548, 18599, 18600, 18623, 18637, 18675, 18676, 18687, 18698, 18736, 18753, 18768, 18792, 18794, 18797, 18823, 18828, 18830, 18850, 18851, 18853, 18854, 18857, 18882, 18885, 18886, 18887, 18888, 18891, 18892, 18893, 18894, 18898, 18901, 18904, 18930, 18947, 18967, 18968, 18970, 18972, 18973, 18974, 18976, 18977, 18980, 18982, 18983, 18984, 18985, 18986, 18991, 19006, 19059, 19060, 19061, 19062, 19064, 19066, 19067, 19069, 19071, 19103, 19104, 19110, 19116, 19153, 19180, 19181, 19186, 19263, 19272, 19273, 19280, 19318, 19409, 19425, 19428, 19456, 19528, 19531, 19538, 19606, 19607, 19609, 19610, 19612, 19613, 19616, 19623, 19636, 19647, 19648, 19685, 19798, 19799, 19800, 19801, 19802, 19805, 19806, 19807, 19808, 19811, 19812, 19813, 19816, 19820, 19821, 19836, 19874, 19875, 19878, 19960, 19985, 20051, 20052, 20053, 20054, 20055, 20056, 20057, 20058, 20059, 20061, 20062, 20063, 20064, 20065, 20066, 20069, 20070, 20071, 20072, 20074, 20078, 20079, 20081, 20084, 20088, 20090, 20110, 20156, 20157, 20168, 20189, 20193, 20245, 20344, 20345, 20346, 20347, 20348, 20349, 20350, 20353, 20354, 20355, 20356, 20357, 20358, 20359, 20360, 20361, 20362, 20363, 20365, 20368, 20370, 20371, 20373, 20374, 20377, 20391, 20392, 20396, 20398, 20400, 20444, 20476, 20520, 20682, 20685, 20687, 20688, 20689, 20690, 20691, 20692, 20693, 20694, 20695, 20698, 20699, 20700, 20701, 20702, 20703, 20707, 20709, 20714, 20728, 20760, 20774, 20864, 20865, 20866, 20867, 20868, 20869, 20870, 20872, 20874, 20899, 20909, 20962, 21041, 21042, 21117, 21118, 21121, 21139, 21146, 21227, 21271, 21272, 21273, 21274, 21275, 21425, 21430, 21493, 21505, 21507, 21510, 21513, 21612, 21616, 21621, 21622, 21623, 21624, 21667, 21675, 21751, 21765, 21766, 21767, 21846, 21847, 
21856, 21857, 21858, 21871, 21872, 21873, 21875, 21876, 21877, 21881, 21883, 21885, 21924, 21925, 21957, 21977, 21978, 21979, 21980, 21984, 21985, 21993, 21997, 21999, 22001, 22031, 22033, 22082, 22113, 22175, 22228, 22247, 22271, 22272, 22371, 22374, 22462, 22463, 22613, 22694, 22695, 22696, 22697, 22700, 22880, 22881, 22882, 22883, 22884, 22901, 22977, 22978, 22979, 22981, 23030, 23032, 23191, 23230, 23236, 23238, 23291, 23340, 23453, 23552, 23553, 23744, 23761, 23774, 24016, 24025, 24037, 24085, 24090, 24096, 24125, 24126, 24128, 24129, 24130, 24132, 24133, 24140, 24141, 24142, 24145, 24150, 24151, 24152, 24153, 24155, 24168, 24169, 24170, 24171, 24172, 24173, 24174, 24181, 24186, 24187, 24189, 24190, 24192, 24193, 24206, 24207, 24208, 24209, 24210, 24211, 24212, 24213, 24214, 24239, 24243, 24244, 24246, 24247, 24249, 24250, 24251, 24252, 24253, 24254, 24255, 24256, 24257, 24258, 24261, 24290, 24297, 24298, 24299, 24300, 24301, 24302, 24303, 24304, 24305, 24307, 24308, 24315, 24326, 24330, 24334, 24335, 24336, 24350, 24364, 24365, 24366, 24367, 24368, 24371, 24372, 24390, 24391, 24393, 24405, 24406, 24408, 24411, 24412, 24413, 24415, 24438, 24439, 24440, 24473, 24474, 24476, 24477, 24478, 24479, 24480, 24481, 24483, 24484, 24485, 24486, 24487, 24520, 24522, 24523, 24524, 24525, 24526, 24527, 24528, 24529, 24530, 24531, 24532, 24533, 24535, 24536, 24537, 24540, 24541, 24542, 24543, 24544, 24545, 24546, 24547, 24549, 24550, 24576, 24586, 24621, 24622, 24623, 24624, 24625, 24626, 24627, 24628, 24629, 24630, 24631, 24632, 24633, 24634, 24635, 24636, 24637, 24638, 24639, 24640, 24641, 24642, 24644, 24645, 24646, 24647, 24648, 24651, 24652, 24653, 24654, 24655, 24656, 24657, 24712, 24713, 24714, 24715, 24716, 24717, 24719, 24720, 24721, 24722, 24723, 24724, 24731, 24775, 24795, 24812, 24831, 24833, 24835, 24836, 24845, 24846, 24851, 24869, 24877, 24888, 24889, 24907, 24926, 24952, 25091, 25169, 25177, 25178, 25195, 25206, 25247, 25248, 25251, 25267, 25340, 25345, 
25455, 25456, 25460, 25464, 25754, 25822, 25845, 25865, 25890, 25891, 25893, 25914, 25975, 25976, 25978, 25980, 25982, 25986, 25996, 26003, 26074, 26112, 26143, 26172, 26182, 26183, 26186, 26194, 26202, 26283, 26284, 26287, 26289, 26293, 26303, 26316, 26320, 26322, 26463, 26465, 26467, 26469, 26476, 26481, 26486, 26489, 26497, 26596, 26663, 26678, 26717, 27136, 27183, 27307, 27340, 27341, 27342, 27344, 27348, 27355, 27607, 27608, 27609, 27610, 27623, 27635, 27641, 27922, 27937, 28165, 28263, 28277, 28422, 28433, 28437, 28508, 28738, 28739, 28740, 28743, 28748, 28820, 28990, 28993, 28997, 29008, 29009, 29010, 29011, 29079, 29084, 29090, 29093, 29101, 29102, 29104, 29105, 29106, 29112, 29113, 29114, 29119, 29120, 29122, 29123, 29124, 29125, 29129, 29130, 29133, 29134, 29135, 29137, 29139, 29146, 29147, 29172, 29174, 29176, 29184, 29191, 29192, 29194, 29200, 29201, 29203, 29221, 29224, 29225, 29226, 29232, 29234, 29258, 29265, 29268, 29273, 29274, 29275, 29276, 29277, 29278, 29280, 29281, 29282, 29300, 29301, 29302, 29310, 29313, 29314, 29315, 29316, 29320, 29382, 29435, 29436, 29454, 29457, 29458, 29468, 29469, 29470, 29473, 29475, 29476, 29477, 29481, 29482, 29483, 29485, 29500, 29501, 29503, 29504, 29505, 29508, 29513, 29515, 29524, 29532, 29533, 29534, 29535, 29537, 29549, 29553, 29556, 29561, 29574, 29618, 29634, 29635, 29637, 29639, 29665, 29666, 29668, 29669, 29672, 29682, 29693, 29709, 29710, 29711, 29717, 29741, 29742, 29746, 29747, 29752, 29753, 29755, 29756, 29759, 29804, 29805, 29832, 29998, 30003, 30005, 30006, 30007, 30009, 30019, 30025, 30040, 30074, 30075, 30077, 30078, 30080, 30082, 30083, 30084, 30290, 30291, 30293, 30349, 30350, 30351, 30352, 30353, 30354, 30358, 30376, 30392, 30424, 30426, 30589, 30590, 30591, 30613, 30614, 30615, 30616, 30617, 30619, 30627, 30628, 30647, 30954, 30958, 30985, 30986, 31316, 31317, 31331, 31334, 31336, 31357, 31358, 31359, 31360, 31497, 31501, 31502, 31503, 31504, 31526, 31527, 31528, 31882, 31883, 31884, 31890, 
31891, 31892, 31893, 31894, 31929, 31966, 31970, 32153, 32498, 32520, 32583, 32618, 32683, 32769, 32780, 32788, 32847, 32848, 32857, 32872, 33058, 33148, 33153, 33255, 33275, 33279, 33300, 33513, 33519, 33520, 33521, 33522, 33524, 33525, 33527, 33528, 33534, 33578, 33579, 33580, 33581, 33582, 33584, 33585, 33586, 33587, 33589, 33591, 33593, 33594, 33599, 33600, 33602, 33619, 33634, 33655, 33753, 33845, 33846, 33866, 33868, 33869, 33871, 33873, 33883, 33888, 33890, 33891, 33907, 33926, 33931, 33933, 33934, 33936, 33972, 33973, 33978, 33987, 33988, 33989, 33990, 33991, 33992, 33993, 33997, 33998, 34000, 34001, 34007, 34015, 34050, 34058, 34081, 34082, 34085, 34086, 34089, 34091, 34092, 34095, 34260, 34265, 34293, 34294, 34295, 34296, 34297, 34309, 34315, 34316, 34320, 34346, 34399, 34419, 34461, 34462, 34463, 34464, 34465, 34469, 34503, 34527, 34590, 34816, 34827, 34845, 34846, 34849, 34852, 34853, 34863, 34941, 34971, 35015, 35020, 35134, 35136, 35144, 35156, 35206, 35221, 35264, 35285, 35292, 35294, 35295, 35296, 35299, 35300, 35301, 35309, 35311, 35315, 35321, 35323, 35324, 35328, 35329, 35330, 35331, 35332, 35342, 35343, 35347, 35351, 35356, 35357, 35386, 35415, 35428, 35440, 35459, 35467, 35471, 35474, 35529, 35562, 35575, 35634, 35637, 35646, 35655, 35663, 35691, 35704, 35732, 35733, 35744, 35835, 35853, 35881, 35884, 35887, 35889, 35893, 35894, 35896, 35897, 35898, 35899, 35900, 35901, 35902, 35907, 35909, 35910, 35917, 35918, 35920, 35921, 35923, 35926, 35928, 35929, 35930, 35939, 35941, 35943, 35944, 35948, 35949, 35950, 35951, 35953, 35954, 35957, 35979, 35997, 35998, 36000, 36018, 36021, 36023, 36089, 36093, 36098, 36099, 36102, 36105, 36111, 36136, 36154, 36172, 36173, 36175, 36193, 36200, 36210, 36223, 36225, 36226, 36229, 36230, 36233, 36239, 36240, 36241, 36242, 36244, 36246, 36247, 36248, 36249, 36258, 36264, 36267, 36269, 36370, 36433, 36437, 36469, 36479, 36480, 36481, 36504, 36515, 36520, 36521, 36529, 36530, 36550, 36584, 36599, 36600, 36608, 
36614, 36666, 36674, 36685, 36707, 36717, 36736, 36743, 36756, 36760, 36775, 36784, 36785, 36787, 36804, 36830, 36843, 36844, 36850, 36854, 36860, 36870, 36874, 36875, 36876, 36877, 36879, 36952, 36958, 36979, 36980, 36991, 36996, 37050, 37051, 37058, 37092, 37093, 37111, 37117, 37120, 37123, 37137, 37142, 37147, 37148, 37149, 37150, 37151, 37152, 37170, 37176, 37187, 37190, 37192, 37193, 37198, 37201, 37205, 37209, 37217, 37221, 37226, 37227, 37231, 37242, 37244, 37255, 37266, 37319, 37324, 37352, 37365, 37375, 37415, 37429, 37448, 37450, 37452, 37495, 37518, 37519, 37569, 37570, 37572, 37573, 37576, 37597, 37608, 37627, 37676, 37677, 37735, 37743, 37748, 37749, 37750, 37751, 37756, 37758, 37766, 37767, 37792, 37801, 37805, 37807, 37808, 37812, 37828, 37834, 37835, 37838, 37840, 37841, 37842, 37843, 37844, 37845, 37846, 37849, 37850, 37852, 37854, 37863, 37866, 37873, 37877, 37880, 37881, 37883, 37897, 37900, 37908, 37927, 37996, 38008, 38081, 38085, 38091, 38092, 38161, 38183, 38187, 38195, 38200, 38282, 38292, 38300, 38302, 38303, 38309, 38314, 38316, 38317, 38321, 38360, 38368, 38374, 38382, 38398, 38399, 38402, 38403, 38410, 38411, 38420, 38429, 38431, 38439, 38452, 38464, 38467, 38483, 38499, 38500, 38514, 38515, 38530, 38533, 38547, 38548, 38556, 38558, 38559, 38560, 38561, 38563, 38564, 38565, 38566, 38567, 38568, 38569, 38571, 38574, 38575, 38578, 38619, 38635);' sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 0 and id in (12, 70, 74, 77, 92, 108, 110, 111, 113, 127, 128, 129, 133, 136, 145, 149, 151, 189, 210, 223, 238, 247, 253, 276, 287, 289, 291, 292, 303, 308, 345, 346, 347, 349, 350, 351, 354, 355, 359, 360, 361, 362, 363, 364, 365, 368, 377, 381, 389, 393, 395, 406, 414, 424, 439, 446, 448, 549, 554, 558, 574, 577, 578, 579, 581, 582, 583, 585, 588, 589, 591, 592, 595, 597, 601, 604, 605, 609, 613, 621, 625, 682, 684, 712, 713, 714, 715, 716, 717, 719, 722, 723, 724, 726, 730, 731, 732, 734, 
735, 738, 739, 740, 743, 749, 751, 752, 753, 754, 755, 758, 765, 782, 787, 816, 822, 830, 836, 851, 857, 860, 861, 869, 882, 970, 1044, 1045, 1047, 1050, 1052, 1055, 1056, 1057, 1058, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1068, 1069, 1072, 1073, 1074, 1075, 1076, 1079, 1080, 1083, 1084, 1086, 1087, 1089, 1094, 1100, 1104, 1105, 1106, 1115, 1116, 1117, 1122, 1124, 1125, 1126, 1131, 1133, 1142, 1143, 1146, 1150, 1151, 1172, 1174, 1176, 1184, 1194, 1248, 1283, 1301, 1307, 1309, 1367, 1381, 1417, 1419, 1452, 1456, 1482, 1491, 1507, 1511, 1513, 1522, 1542, 1562, 1585, 1587, 1591, 1624, 1626, 1628, 1652, 1687, 1688, 1689, 1692, 1693, 1694, 1696, 1698, 1699, 1701, 1704, 1710, 1711, 1714, 1716, 1719, 1720, 1727, 1728, 1730, 1745, 1750, 1751, 1755, 1757, 1770, 1809, 1815, 1820, 1831, 1835, 1872, 1884, 1887, 1898, 1935, 1955, 1993, 2009, 2025, 2026, 2029, 2030, 2031, 2199, 2241, 2244, 2246, 2275, 2276, 2277, 2278, 2279, 2305, 2323, 2324, 2325, 2327, 2328, 2347, 2360, 2402, 2404, 2410, 2415, 2442, 2448, 2450, 2451, 2452, 2461, 2462, 2467, 2477, 2509, 2510, 2512, 2513, 2518, 2522, 2524, 2531, 2543, 2547, 2554, 2555, 2576, 2577, 2578, 2579, 2580, 2583, 2586, 2605, 2609, 2624, 2629, 2646, 2651, 2652, 2653, 2655, 2656, 2659, 2661, 2662, 2671, 2676, 2677, 2756, 2757, 2758, 2760, 2761, 2762, 2768, 2771, 2772, 2773, 2774, 2776, 2777, 2781, 2782, 2783, 2786, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2798, 2811, 2815, 2822, 2869, 2884, 2907, 2913, 2920, 2924, 3029, 3127, 3141, 3146, 3172, 3173, 3174, 3175, 3176, 3177, 3178, 3180, 3182, 3183, 3184, 3185, 3189, 3192, 3194, 3198, 3199, 3202, 3203, 3207, 3208, 3211, 3223, 3224, 3230, 3236, 3252, 3253, 3262, 3275, 3302, 3305, 3316, 3365, 3388, 3389, 3391, 3392, 3396, 3397, 3398, 3399, 3400, 3401, 3402, 3405, 3406, 3408, 3409, 3412, 3415, 3416, 3418, 3419, 3420, 3421, 3422, 3423, 3424, 3425, 3426, 3427, 3428, 3431, 3432, 3433, 3436, 3438, 3439, 3443, 3444, 3445, 3446, 3450, 3452, 3455, 3456, 3458, 3461, 3466, 3467, 3470, 3503, 3526, 
3532, 3536, 3538, 3541, 3542, 3543, 3549, 3563, 3573, 3597, 3598, 3620, 3626, 3662, 3819, 3921, 3922, 3923, 3925, 3927, 3931, 3932, 3933, 3934, 3935, 3936, 3937, 3938, 3939, 3940, 3942, 3943, 3944, 3945, 3948, 3950, 3952, 3953, 3954, 3955, 3957, 3958, 3959, 3960, 3961, 3963, 3965, 3966, 3967, 3968, 3971, 3972, 3980, 3988, 3995, 4000, 4005, 4011, 4039, 4043, 4046, 4048, 4050, 4059, 4077, 4086, 4089, 4098, 4101, 4104, 4109, 4111, 4123, 4127, 4170, 4184, 4203, 4215, 4221, 4235, 4287, 4295, 4345, 4362, 4367, 4448, 4449, 4451, 4452, 4453, 4454, 4455, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4472, 4478, 4480, 4481, 4482, 4496, 4500, 4504, 4508, 4513, 4518, 4523, 4524, 4548, 4551, 4567, 4572, 4598, 4607, 4608, 4611, 4657, 4786, 4788, 4789, 4791, 4792, 4793, 4794, 4795, 4796, 4797, 4798, 4799, 4804, 4805, 4811, 4815, 4817, 4819, 4829, 4839, 4840, 5037, 5038, 5040, 5047, 5179, 5191, 5192, 5210, 5248, 5249, 5266, 5275, 5276, 5322, 5323, 5327, 5330, 5362, 5410, 5411, 5416, 5451, 5462, 5493, 5494, 5496, 5519, 5536, 5548, 5555, 5587, 5588, 5589, 5590, 5591, 5594, 5599, 5617, 5633, 5636, 5660, 5667, 5695, 5697, 5701, 5702, 5706, 5767, 5768, 5769, 5773, 5778, 5786, 5831, 5832, 5833, 5835, 5836, 5837, 5839, 5844, 5849, 5850, 5858, 5860, 5889, 5901, 5915, 5916, 5918, 5920, 5991, 5992, 5993, 5994, 5995, 6009, 6045, 6079, 6080, 6081, 6083, 6084, 6085, 6086, 6087, 6100, 6101, 6107, 6185, 6249, 6278, 6279, 6280, 6281, 6282, 6283, 6285, 6305, 6306, 6387, 6393, 6396, 6397, 6398, 6411, 6439, 6498, 6505, 6511, 6513, 6518, 6520, 6524, 6525, 6526, 6527, 6532, 6543, 6553, 6555, 6565, 6566, 6569, 6573, 6574, 6581, 6585, 6601, 6605, 6606, 6612, 6615, 6617, 6621, 6645, 6646, 6648, 6651, 6652, 6658, 6660, 6667, 6672, 6676, 6682, 6684, 6688, 6690, 6692, 6693, 6700, 6704, 6743, 6769, 6771, 6772, 6775, 6778, 6783, 6785, 6789, 6793, 6818, 6824, 6829, 6830, 6834, 6839, 6845, 6846, 6849, 6850, 6855, 6859, 6866, 6873, 6878, 6887, 6888, 6889, 6890, 6907, 6926, 
6945, 6948, 6954, 6963, 7006, 7066, 7082, 7102, 7121, 7162, 7163, 7271, 7272, 7273, 7285, 7314, 7315, 7350, 7362, 7364, 7398, 7441, 7442, 7443, 7444, 7446, 7451, 7454, 7456, 7462, 7464, 7504, 7515, 7516, 7547, 7548, 7634, 7659, 7660, 7661, 7662, 7663, 7664, 7665, 7672, 7776, 7777, 7783, 7784, 7788, 7789, 7792, 7795, 7797, 7798, 7799, 7809, 7831, 7889, 7917, 7918, 7920, 7926, 7930, 7932, 7933, 7935, 7936, 7941, 7944, 7960, 7962, 7971, 8008, 8017, 8070, 8075, 8076, 8110, 8111, 8112, 8117, 8120, 8128, 8129, 8130, 8133, 8136, 8140, 8143, 8144, 8145, 8148, 8149, 8150, 8153, 8154, 8159, 8163, 8203, 8225, 8268, 8270, 8302, 8310, 8312, 8419, 8421, 8496, 8497, 8498, 8500, 8505, 8506, 8507, 8508, 8510, 8513, 8517, 8533, 8543, 8584, 8710, 8717, 8718, 8719, 8720, 8721, 8722, 8724, 8726, 8730, 8732, 8733, 8734, 8737, 8739, 8740, 8741, 8742, 8743, 8744, 8745, 8747, 8748, 8750, 8751, 8752, 8753, 8754, 8755, 8756, 8757, 8759, 8761, 8764, 8766, 8768, 8769, 8773, 8774, 8775, 8784, 8811, 8817, 9042, 9056, 9207, 9219, 9240, 9249, 9273, 9318, 9322, 9422, 9457, 9485, 9562, 9623, 9647, 9836, 9837, 9922, 10067, 10068, 10069, 10168, 10185, 10288, 10400, 10401, 10513, 10515, 10606, 10700, 10702, 10703, 10771, 10772, 10819, 10821, 10927, 11019, 11056, 11113, 11142, 11143, 11225, 11226, 11227, 11343, 11361, 11362, 11364, 11377, 11448, 11460, 11461, 11462, 11463, 11465, 11466, 11468, 11493, 11609, 11610, 11611, 11617, 11638, 11659, 11718, 11748, 11749, 11750, 11751, 11762, 11821, 11850, 11891, 11898, 11911, 11913, 11914, 11915, 11916, 11917, 11918, 11919, 11920, 11921, 11922, 11923, 11926, 11928, 11934, 11955, 11980, 12026, 12030, 12044, 12092, 12093, 12094, 12095, 12096, 12098, 12100, 12101, 12102, 12103, 12104, 12105, 12106, 12107, 12108, 12109, 12110, 12111, 12112, 12113, 12114, 12122, 12123, 12125, 12144, 12147, 12234, 12235, 12237, 12256, 12305, 12339, 12346, 12407, 12448, 12511, 12665, 12705, 12706, 12708, 12709, 12710, 12711, 12712, 12713, 12714, 12716, 12717, 12718, 12719, 12720, 
12721, 12722, 12725, 12729, 12742, 12753, 12762, 12802, 12813, 12816, 12821, 12823, 12843, 12856, 12905, 12907, 13006, 13061, 13062, 13063, 13137, 13138, 13198, 13329, 13330, 13331, 13332, 13494, 13495, 13582, 13583, 13584, 13585, 13586, 13697, 13833, 13834, 13835, 13836, 13837, 13840, 14160, 14161, 14200, 14341, 14342, 14343, 14590, 14591, 14597, 14610, 14614, 14631, 14632, 14633, 14634, 14635, 14650, 14655, 14656, 14689, 14726, 14777, 14870, 14871, 14872, 14921, 14922, 14923, 14991, 14992, 14993, 14994, 14995, 15136, 15137, 15138, 15139, 15140, 15141, 15142, 15143, 15152, 15216, 15265, 15277, 15387, 15388, 15483, 15546, 15550, 15587, 15590, 15623, 15641, 15653, 15711, 15712, 15730, 15743, 15763, 15794, 15805, 15821, 15831, 15884, 15932, 16039, 16122, 16124, 16153, 16175, 16181, 16220, 16233, 16264, 16277, 16306, 16361, 16377, 16391, 16392, 16393, 16402, 16404, 16431, 16439, 16440, 16444, 16447, 16448, 16455, 16457, 16463, 16468, 16513, 16524, 16528, 16551, 16569, 16594, 16596, 16600, 16610, 16647, 16648, 16718, 16731, 16763, 16765, 16794, 16795, 16899, 16948, 16962, 16993, 16998, 17011, 17013, 17034, 17061, 17062, 17141, 17142, 17143, 17144, 17155, 17158, 17248, 17262, 17263, 17264, 17265, 17266, 17333, 17334, 17335, 17395, 17396, 17398, 17400, 17401, 17405, 17410, 17412, 17417, 17420, 17431, 17547, 17584, 17585, 17587, 17599, 17674, 17676, 17677, 17679, 17711, 17719, 17749, 17750, 17751, 17752, 17753, 17754, 17756, 17757, 17811, 17812, 17814, 17948, 17963, 17964, 17965, 17989, 17998, 18083, 18139, 18145, 18165, 18229, 18230, 18257, 18264, 18273, 18321, 18322, 18323, 18351, 18515, 18548, 18599, 18600, 18623, 18637, 18675, 18676, 18687, 18698, 18736, 18753, 18768, 18792, 18794, 18797, 18823, 18828, 18830, 18850, 18851, 18853, 18854, 18857, 18882, 18885, 18886, 18887, 18888, 18891, 18892, 18893, 18894, 18898, 18901, 18904, 18930, 18947, 18967, 18968, 18970, 18972, 18973, 18974, 18976, 18977, 18980, 18982, 18983, 18984, 18985, 18986, 18991, 19006, 19059, 19060, 
19061, 19062, 19064, 19066, 19067, 19069, 19071, 19103, 19104, 19110, 19116, 19153, 19180, 19181, 19186, 19263, 19272, 19273, 19280, 19318, 19409, 19425, 19428, 19456, 19528, 19531, 19538, 19606, 19607, 19609, 19610, 19612, 19613, 19616, 19623, 19636, 19647, 19648, 19685, 19798, 19799, 19800, 19801, 19802, 19805, 19806, 19807, 19808, 19811, 19812, 19813, 19816, 19820, 19821, 19836, 19874, 19875, 19878, 19960, 19985, 20051, 20052, 20053, 20054, 20055, 20056, 20057, 20058, 20059, 20061, 20062, 20063, 20064, 20065, 20066, 20069, 20070, 20071, 20072, 20074, 20078, 20079, 20081, 20084, 20088, 20090, 20110, 20156, 20157, 20168, 20189, 20193, 20245, 20344, 20345, 20346, 20347, 20348, 20349, 20350, 20353, 20354, 20355, 20356, 20357, 20358, 20359, 20360, 20361, 20362, 20363, 20365, 20368, 20370, 20371, 20373, 20374, 20377, 20391, 20392, 20396, 20398, 20400, 20444, 20476, 20520, 20682, 20685, 20687, 20688, 20689, 20690, 20691, 20692, 20693, 20694, 20695, 20698, 20699, 20700, 20701, 20702, 20703, 20707, 20709, 20714, 20728, 20760, 20774, 20864, 20865, 20866, 20867, 20868, 20869, 20870, 20872, 20874, 20899, 20909, 20962, 21041, 21042, 21117, 21118, 21121, 21139, 21146, 21227, 21271, 21272, 21273, 21274, 21275, 21425, 21430, 21493, 21505, 21507, 21510, 21513, 21612, 21616, 21621, 21622, 21623, 21624, 21667, 21675, 21751, 21765, 21766, 21767, 21846, 21847, 21856, 21857, 21858, 21871, 21872, 21873, 21875, 21876, 21877, 21881, 21883, 21885, 21924, 21925, 21957, 21977, 21978, 21979, 21980, 21984, 21985, 21993, 21997, 21999, 22001, 22031, 22033, 22082, 22113, 22175, 22228, 22247, 22271, 22272, 22371, 22374, 22462, 22463, 22613, 22694, 22695, 22696, 22697, 22700, 22880, 22881, 22882, 22883, 22884, 22901, 22977, 22978, 22979, 22981, 23030, 23032, 23191, 23230, 23236, 23238, 23291, 23340, 23453, 23552, 23553, 23744, 23761, 23774, 24016, 24025, 24037, 24085, 24090, 24096, 24125, 24126, 24128, 24129, 24130, 24132, 24133, 24140, 24141, 24142, 24145, 24150, 24151, 24152, 24153, 24155, 
24168, 24169, 24170, 24171, 24172, 24173, 24174, 24181, 24186, 24187, 24189, 24190, 24192, 24193, 24206, 24207, 24208, 24209, 24210, 24211, 24212, 24213, 24214, 24239, 24243, 24244, 24246, 24247, 24249, 24250, 24251, 24252, 24253, 24254, 24255, 24256, 24257, 24258, 24261, 24290, 24297, 24298, 24299, 24300, 24301, 24302, 24303, 24304, 24305, 24307, 24308, 24315, 24326, 24330, 24334, 24335, 24336, 24350, 24364, 24365, 24366, 24367, 24368, 24371, 24372, 24390, 24391, 24393, 24405, 24406, 24408, 24411, 24412, 24413, 24415, 24438, 24439, 24440, 24473, 24474, 24476, 24477, 24478, 24479, 24480, 24481, 24483, 24484, 24485, 24486, 24487, 24520, 24522, 24523, 24524, 24525, 24526, 24527, 24528, 24529, 24530, 24531, 24532, 24533, 24535, 24536, 24537, 24540, 24541, 24542, 24543, 24544, 24545, 24546, 24547, 24549, 24550, 24576, 24586, 24621, 24622, 24623, 24624, 24625, 24626, 24627, 24628, 24629, 24630, 24631, 24632, 24633, 24634, 24635, 24636, 24637, 24638, 24639, 24640, 24641, 24642, 24644, 24645, 24646, 24647, 24648, 24651, 24652, 24653, 24654, 24655, 24656, 24657, 24712, 24713, 24714, 24715, 24716, 24717, 24719, 24720, 24721, 24722, 24723, 24724, 24731, 24775, 24795, 24812, 24831, 24833, 24835, 24836, 24845, 24846, 24851, 24869, 24877, 24888, 24889, 24907, 24926, 24952, 25091, 25169, 25177, 25178, 25195, 25206, 25247, 25248, 25251, 25267, 25340, 25345, 25455, 25456, 25460, 25464, 25754, 25822, 25845, 25865, 25890, 25891, 25893, 25914, 25975, 25976, 25978, 25980, 25982, 25986, 25996, 26003, 26074, 26112, 26143, 26172, 26182, 26183, 26186, 26194, 26202, 26283, 26284, 26287, 26289, 26293, 26303, 26316, 26320, 26322, 26463, 26465, 26467, 26469, 26476, 26481, 26486, 26489, 26497, 26596, 26663, 26678, 26717, 27136, 27183, 27307, 27340, 27341, 27342, 27344, 27348, 27355, 27607, 27608, 27609, 27610, 27623, 27635, 27641, 27922, 27937, 28165, 28263, 28277, 28422, 28433, 28437, 28508, 28738, 28739, 28740, 28743, 28748, 28820, 28990, 28993, 28997, 29008, 29009, 29010, 29011, 29079, 
29084, 29090, 29093, 29101, 29102, 29104, 29105, 29106, 29112, 29113, 29114, 29119, 29120, 29122, 29123, 29124, 29125, 29129, 29130, 29133, 29134, 29135, 29137, 29139, 29146, 29147, 29172, 29174, 29176, 29184, 29191, 29192, 29194, 29200, 29201, 29203, 29221, 29224, 29225, 29226, 29232, 29234, 29258, 29265, 29268, 29273, 29274, 29275, 29276, 29277, 29278, 29280, 29281, 29282, 29300, 29301, 29302, 29310, 29313, 29314, 29315, 29316, 29320, 29382, 29435, 29436, 29454, 29457, 29458, 29468, 29469, 29470, 29473, 29475, 29476, 29477, 29481, 29482, 29483, 29485, 29500, 29501, 29503, 29504, 29505, 29508, 29513, 29515, 29524, 29532, 29533, 29534, 29535, 29537, 29549, 29553, 29556, 29561, 29574, 29618, 29634, 29635, 29637, 29639, 29665, 29666, 29668, 29669, 29672, 29682, 29693, 29709, 29710, 29711, 29717, 29741, 29742, 29746, 29747, 29752, 29753, 29755, 29756, 29759, 29804, 29805, 29832, 29998, 30003, 30005, 30006, 30007, 30009, 30019, 30025, 30040, 30074, 30075, 30077, 30078, 30080, 30082, 30083, 30084, 30290, 30291, 30293, 30349, 30350, 30351, 30352, 30353, 30354, 30358, 30376, 30392, 30424, 30426, 30589, 30590, 30591, 30613, 30614, 30615, 30616, 30617, 30619, 30627, 30628, 30647, 30954, 30958, 30985, 30986, 31316, 31317, 31331, 31334, 31336, 31357, 31358, 31359, 31360, 31497, 31501, 31502, 31503, 31504, 31526, 31527, 31528, 31882, 31883, 31884, 31890, 31891, 31892, 31893, 31894, 31929, 31966, 31970, 32153, 32498, 32520, 32583, 32618, 32683, 32769, 32780, 32788, 32847, 32848, 32857, 32872, 33058, 33148, 33153, 33255, 33275, 33279, 33300, 33513, 33519, 33520, 33521, 33522, 33524, 33525, 33527, 33528, 33534, 33578, 33579, 33580, 33581, 33582, 33584, 33585, 33586, 33587, 33589, 33591, 33593, 33594, 33599, 33600, 33602, 33619, 33634, 33655, 33753, 33845, 33846, 33866, 33868, 33869, 33871, 33873, 33883, 33888, 33890, 33891, 33907, 33926, 33931, 33933, 33934, 33936, 33972, 33973, 33978, 33987, 33988, 33989, 33990, 33991, 33992, 33993, 33997, 33998, 34000, 34001, 34007, 34015, 
34050, 34058, 34081, 34082, 34085, 34086, 34089, 34091, 34092, 34095, 34260, 34265, 34293, 34294, 34295, 34296, 34297, 34309, 34315, 34316, 34320, 34346, 34399, 34419, 34461, 34462, 34463, 34464, 34465, 34469, 34503, 34527, 34590, 34816, 34827, 34845, 34846, 34849, 34852, 34853, 34863, 34941, 34971, 35015, 35020, 35134, 35136, 35144, 35156, 35206, 35221, 35264, 35285, 35292, 35294, 35295, 35296, 35299, 35300, 35301, 35309, 35311, 35315, 35321, 35323, 35324, 35328, 35329, 35330, 35331, 35332, 35342, 35343, 35347, 35351, 35356, 35357, 35386, 35415, 35428, 35440, 35459, 35467, 35471, 35474, 35529, 35562, 35575, 35634, 35637, 35646, 35655, 35663, 35691, 35704, 35732, 35733, 35744, 35835, 35853, 35881, 35884, 35887, 35889, 35893, 35894, 35896, 35897, 35898, 35899, 35900, 35901, 35902, 35907, 35909, 35910, 35917, 35918, 35920, 35921, 35923, 35926, 35928, 35929, 35930, 35939, 35941, 35943, 35944, 35948, 35949, 35950, 35951, 35953, 35954, 35957, 35979, 35997, 35998, 36000, 36018, 36021, 36023, 36089, 36093, 36098, 36099, 36102, 36105, 36111, 36136, 36154, 36172, 36173, 36175, 36193, 36200, 36210, 36223, 36225, 36226, 36229, 36230, 36233, 36239, 36240, 36241, 36242, 36244, 36246, 36247, 36248, 36249, 36258, 36264, 36267, 36269, 36370, 36433, 36437, 36469, 36479, 36480, 36481, 36504, 36515, 36520, 36521, 36529, 36530, 36550, 36584, 36599, 36600, 36608, 36614, 36666, 36674, 36685, 36707, 36717, 36736, 36743, 36756, 36760, 36775, 36784, 36785, 36787, 36804, 36830, 36843, 36844, 36850, 36854, 36860, 36870, 36874, 36875, 36876, 36877, 36879, 36952, 36958, 36979, 36980, 36991, 36996, 37050, 37051, 37058, 37092, 37093, 37111, 37117, 37120, 37123, 37137, 37142, 37147, 37148, 37149, 37150, 37151, 37152, 37170, 37176, 37187, 37190, 37192, 37193, 37198, 37201, 37205, 37209, 37217, 37221, 37226, 37227, 37231, 37242, 37244, 37255, 37266, 37319, 37324, 37352, 37365, 37375, 37415, 37429, 37448, 37450, 37452, 37495, 37518, 37519, 37569, 37570, 37572, 37573, 37576, 37597, 37608, 37627, 
37676, 37677, 37735, 37743, 37748, 37749, 37750, 37751, 37756, 37758, 37766, 37767, 37792, 37801, 37805, 37807, 37808, 37812, 37828, 37834, 37835, 37838, 37840, 37841, 37842, 37843, 37844, 37845, 37846, 37849, 37850, 37852, 37854, 37863, 37866, 37873, 37877, 37880, 37881, 37883, 37897, 37900, 37908, 37927, 37996, 38008, 38081, 38085, 38091, 38092, 38161, 38183, 38187, 38195, 38200, 38282, 38292, 38300, 38302, 38303, 38309, 38314, 38316, 38317, 38321, 38360, 38368, 38374, 38382, 38398, 38399, 38402, 38403, 38410, 38411, 38420, 38429, 38431, 38439, 38452, 38464, 38467, 38483, 38499, 38500, 38514, 38515, 38530, 38533, 38547, 38548, 38556, 38558, 38559, 38560, 38561, 38563, 38564, 38565, 38566, 38567, 38568, 38569, 38571, 38574, 38575, 38578, 38619, 38635);' print(sql) papers = pd.read_sql(sql, con=db) for index, row in papers.iterrows(): lang = None id = row[0] english = 0 other = 0 text = "" res = "" print(id) if id: # with open(os.path.join('data/txt', str(id) + '.txt')) as infile: with open( os.path.join( '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/txt', str(id) + '.txt')) as infile: for line in infile: if not re.match(r'^\s*$', line): line = re.sub(r"-\n", "", line) line = re.sub(r"\n", " ", line) text += line infile.close() lenText = len(text) nrequest = round(float(lenText) / 5000) count = 1 while count <= nrequest: res = '' content = "" posIni = (count * 5000) - 5000 posFin = (count * 5000) - 1 content += text[posIni:posFin] try: translator = Translator(random.choice(key_choices)) res = translator.detect_lang([content]) except: pass if res: if res == 'en': english += 1 else: other += 1 count += 1 if english > other: lang = "English" sql = "update resolved_papers set english = 1 where id = %s" % ( id) else: lang = "Other" try: cur.execute(sql) db.commit() except: db.rollback() print("Id: %s. Language: %s" % (id, lang)) print("Done!")
def _downloadIEEE():
    """Fetch the PDFs of pending IEEE papers with ``wget`` and flag them.

    Selects every row of ``resolved_papers`` whose ``source`` contains
    ``ieee``, that has ``downloaded = 0`` and whose title language is ``en``.
    For each row, the first run of digits found in ``main_link`` is used as
    the ``arnumber`` of the IEEE Xplore PDF endpoint, and the file is written
    to the ``tocheck`` folder as ``<id>.pdf``. The row is flagged
    ``downloaded = 1`` only when ``wget`` exits with status 0.

    A random pause of 1-6 seconds follows every download attempt.

    Returns:
        None. Progress and failures are reported on standard output.
    """
    import subprocess  # function-scope import: only needed to run wget

    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
    sql = "SELECT p.id, p.main_link, p.direct_link FROM `resolved_papers` p inner join `resolved_papers_title` pt on pt.Id = p.Id where p.source like '%ieee%' and p.downloaded = 0 and pt.`title_language` = 'en';"
    try:
        papers = pd.read_sql(sql, con=db)
        for index, row in papers.iterrows():
            paper_id = row['id']
            path = destination + str(paper_id) + '.pdf'
            print(path)

            # A link without digits must not abort the whole run.
            numbers = re.findall(r'\d+', str(row['main_link']))
            if not numbers:
                print('No article number found in main_link for identifier %s'
                      % (paper_id))
                continue

            url_pdf = 'https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&isnumber=&arnumber=%s' % (
                numbers[0])
            try:
                # Argument list instead of a shell string: nothing in the URL
                # or the path is interpreted by a shell.
                completed = subprocess.run(['wget', url_pdf, '-O', path])
            except OSError as error:
                # Raised when wget itself cannot be started.
                print('Failed to fetch IEEE page with identifier %s: %s'
                      % (paper_id, error))
            else:
                if completed.returncode == 0:
                    sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                        paper_id)
                    try:
                        cur.execute(sql)
                        db.commit()
                        print("Id: %s. Downloaded: True. Saved!" % (paper_id))
                    except pymysql.MySQLError as error:
                        db.rollback()
                        print("Id: %s. Could not save download flag: %s"
                              % (paper_id, error))
                else:
                    # Do not flag the paper when wget reported a failure.
                    print('Failed to fetch IEEE page with identifier %s (wget exit status %s).'
                          % (paper_id, completed.returncode))
            time.sleep(randint(1, 6))
    finally:
        cur.close()
        db.close()
def _downloadSpringer(ids):
    """Download one Springer paper as a PDF and flag it as downloaded.

    The PDF address is derived from ``main_link`` by replacing ``article`` or
    ``chapter`` with ``content/pdf`` and appending ``.pdf``. The response is
    saved to the ``tocheck`` folder as ``<id>.pdf`` only when the server
    answers 200 with an ``application/pdf`` content type; the row in
    ``resolved_papers`` is then set to ``downloaded = 1``.

    Every call ends with a random pause of 1-6 seconds, whatever the outcome.

    Args:
        ids: ``(id, main_link, direct_link)``. ``direct_link`` is unpacked
            but not used.

    Returns:
        None. Progress and failures are reported on standard output.
    """
    paper_id = None  # keeps the error message meaningful if unpacking fails
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    try:
        paper_id, main_link, direct_link = ids

        if 'article' in main_link:
            # https://link.springer.com/article/10.1007/s10579-014-9282-3
            url_pdf = main_link.replace('article', 'content/pdf') + '.pdf'
        elif 'chapter' in main_link:
            # http://link.springer.com/chapter/10.1007/978-3-319-09846-3_4/fulltext.html
            url_pdf = main_link.replace('chapter', 'content/pdf') + '.pdf'
        else:
            # Neither pattern matched: there is no PDF address to request.
            print('Unsupported Springer link for identifier %s: %s'
                  % (paper_id, main_link))
            return

        ua = str(get_random_ua())
        try:
            response = requests.get(url_pdf, headers={'User-Agent': ua})
        except requests.RequestException:
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        content_type = response.headers.get('content-type')
        if response.status_code != 200 or 'application/pdf' not in str(content_type):
            print('Title with identifier %s not found' % (paper_id))
            return

        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        path = destination + str(paper_id) + '.pdf'
        with open(path, 'wb') as f:
            f.write(response.content)

        sql = "update resolved_papers set downloaded = 1 where id = %s" % (
            paper_id)
        try:
            cur.execute(sql)
            db.commit()
            print("Id: %s. Downloaded: True. Saved!" % (paper_id))
        except pymysql.MySQLError as error:
            db.rollback()
            print("Id: %s. Could not save download flag: %s" % (paper_id, error))
    except Exception as error:
        # Worker boundary: report the failure instead of aborting the batch.
        print('Failed to fetch Springer page with identifier %s: %s'
              % (paper_id, error))
    finally:
        cur.close()
        db.close()
        time.sleep(randint(1, 6))
def _download(ids):
    """Search ``SCHOLARS_BASE_URL`` for a title and download the matching PDF.

    The lower-cased query is sent to ``/search``. Only the first ``result``
    block of the answer is inspected: when its title equals the query
    (case-insensitively), the ``summary`` link of that result is rewritten
    into a ``download`` link and fetched. The file is stored in the
    ``tocheck`` folder as ``<id>.pdf`` and the row in ``resolved_papers`` is
    flagged ``downloaded = 1`` only when the content type is
    ``application/pdf``.

    Every call ends with a random pause of 1-6 seconds, whatever the outcome.

    Args:
        ids: ``(id, query)`` where ``query`` is the paper title to look up.

    Returns:
        None. Progress and failures are reported on standard output.
    """
    paper_id = None  # keeps the error message meaningful if unpacking fails
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    try:
        paper_id, query = ids
        # No second positional argument: that slot is ``doseq``, not an
        # encoding.
        params = urlencode({'q': query.lower()})
        url = SCHOLARS_BASE_URL + "/search?" + params
        print(url)

        ua = str(get_random_ua())
        try:
            response = requests.get(url, headers={'User-Agent': ua})
        except requests.RequestException:
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        if response.status_code != 200:
            print('Title is not found with identifier %s' % (paper_id))
            return

        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all('div', {'class': 'result'})
        if not results:
            print('Title is not found with identifier %s' % (paper_id))
            return

        # The second child of the result block is split into lines: line 1
        # holds the anchor markup, line 2 the title text.
        link = str(results[0].contents[1]).split('\n')
        title = re.sub('<[^<]+?>', '', link[2])
        if query.lower() != title.lower():
            print('Title is not found with identifier %s' % (paper_id))
            return

        anchor = BeautifulSoup(link[1], "html.parser").find(
            "a", class_="remove doc_details")
        if anchor is None:
            print('Title is not found with identifier %s' % (paper_id))
            return

        # /viewdoc/summary;jsessionid=...?doi=...&rank=1
        #   -> /viewdoc/download?doi=...&rep=rep1&type=pdf
        suffix = re.sub(r';.*\?', '?', anchor.attrs['href'])
        suffix = suffix.replace('summary', 'download').replace(
            '&rank=1', '&rep=rep1&type=pdf')
        url_pdf = SCHOLARS_BASE_URL + suffix
        print(url_pdf)

        res = requests.get(url_pdf)
        content_type = res.headers.get('content-type')
        if 'application/pdf' in str(content_type):
            destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
            path = destination + str(paper_id) + '.pdf'
            with open(path, 'wb') as f:
                f.write(res.content)

            sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                paper_id)
            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: True. Saved!" % (paper_id))
            except pymysql.MySQLError as error:
                db.rollback()
                print("Id: %s. Could not save download flag: %s"
                      % (paper_id, error))
    except Exception as error:
        # Worker boundary: report the failure instead of aborting the batch.
        print(
            'Failed to fetch citeseerx page with identifier %s due to request exception.'
            % (paper_id))
        print(error)
    finally:
        cur.close()
        db.close()
        time.sleep(randint(1, 6))
import pymysql
from base import db, cur
from os import listdir
from os.path import isfile, join

# Flag as downloaded every paper whose PDF was placed by hand in data/manual.
# The part of the file name before the first dot is taken as the paper id.
files = [f for f in listdir('data/manual') if isfile(join('data/manual', f))]
for file in files:
    name = file.split('.')[0]
    # Hidden files (empty stem) and any other non-numeric name cannot be a
    # paper id and must never reach the SQL statement.
    if not name.isdecimal():
        print("Skipping %s: file name is not a paper id" % (file))
        continue
    # The id is passed as a query parameter rather than formatted into the
    # statement text.
    sql = "update resolved_papers set downloaded = 1 where id = %s"
    try:
        cur.execute(sql, (int(name),))
        db.commit()
        print("Id: %s. Updated!" % (name))
    except pymysql.MySQLError as error:
        db.rollback()
        print("Id: %s. Not updated: %s" % (name, error))
def downloadPDFIEEE(ids):
    """Download one IEEE paper, trying the direct link and then IEEE Xplore.

    Nothing is attempted unless the title language is ``"en"``. The first
    attempt hands ``direct_link`` to ``DownloadPDF().download``. If that does
    not report success, the first run of digits in ``main_link`` is used as
    the ``arnumber`` of the IEEE Xplore ``stamp.jsp`` address, which is
    requested directly; the answer is saved as ``<id>.pdf`` in the
    ``tocheck`` folder only when it is a 200 response with an
    ``application/pdf`` content type. On success the row in
    ``resolved_papers`` is flagged ``downloaded = 1``.

    Args:
        ids: ``(id, title_language, main_link, direct_link)``.

    Returns:
        None. Progress is reported on standard output.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
    try:
        i, res_title, main_link, direct_link = ids
        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)
        downloaded = "False"

        # First attempt: the direct link through the DownloadPDF helper.
        if direct_link:
            p = DownloadPDF().download(
                direct_link, destination=destination, path=str(i) + '.pdf')
            # Kept as an equality test: the return type of download() is not
            # visible from this function.
            if p == True:  # noqa: E712
                downloaded = "True"

        # Second attempt: build the IEEE Xplore address from the article
        # number embedded in main_link.
        if downloaded == "False":
            numbers = re.findall(r'\d+', str(main_link))
            if numbers:
                url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % (
                    numbers[0])
                try:
                    response = requests.get(url)
                except requests.RequestException:
                    print("Connection refused")
                else:
                    content_type = response.headers.get('content-type')
                    if (response.status_code == 200
                            and 'application/pdf' in str(content_type)):
                        # The id is formatted into the path itself, and the
                        # handle is closed as soon as the content is written.
                        with open(destination + '%s.pdf' % (i), 'wb') as f:
                            f.write(response.content)
                        downloaded = "True"

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)
            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as error:
                db.rollback()
                print("Id: %s. Could not save download flag: %s" % (i, error))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        # Deliberately ignored, as before: the paper simply stays pending.
        pass
    finally:
        cur.close()
        db.close()
def downloadPDF(ids):
    """Download one paper, trying ``direct_link`` first and then ``main_link``.

    Nothing is attempted unless the title language is ``"en"``. Each
    non-empty link is handed in turn to ``DownloadPDF().download``, which
    writes ``<id>.pdf`` into the ``tocheck`` folder; the first link that
    reports success ends the search. On success the row in
    ``resolved_papers`` is flagged ``downloaded = 1``.

    Args:
        ids: ``(id, title_language, main_link, direct_link)``.

    Returns:
        None. Progress is reported on standard output.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    try:
        i, res_title, main_link, direct_link = ids
        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)

        downloaded = "False"
        # At most two attempts: the direct link, then the main link.
        for url in (direct_link, main_link):
            if not url:
                continue
            p = DownloadPDF().download(
                url,
                destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                path=str(i) + '.pdf')
            # Kept as an equality test: the return type of download() is not
            # visible from this function.
            if p == True:  # noqa: E712
                downloaded = "True"
                break

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)
            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as error:
                db.rollback()
                print("Id: %s. Could not save download flag: %s" % (i, error))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        # Deliberately ignored, as before: the paper simply stays pending.
        pass
    finally:
        # Runs on every path, so neither the cursor nor the connection leaks.
        cur.close()
        db.close()
def _filterTitle(papers):
    """Select a paper when its title mentions any of the search keywords.

    The lower-cased title is searched (substring match) for three keyword
    groups: cross/multi-language terms, copy terms and detection terms. When
    at least ``threshold`` (1) keywords of any group are present, the paper
    id is inserted into ``resolved_papers_selected_title``.

    Args:
        papers: ``(id, title)``.

    Returns:
        True when the paper was selected and stored; False when no keyword
        group matched or when storing failed (``no saved`` is printed and the
        transaction is rolled back in the latter case).
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    try:
        paper_id, title = papers
        threshold = 1
        title = title.lower()

        diff_language = [
            "Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
            "Cross-linguistic", "Crosslinguistic", "Multi-language",
            "Multilanguage", "Multi-lingual", "Multilingual",
            "Multi-linguistic", "Multilinguistic", "Machine-translation",
        ]
        copy = ["Copy", "Duplicate", "Plagiarism", ]
        detection = ["Detection", "Discovery", ]

        k_dflanguage = sum(1 for word in diff_language if word.lower() in title)
        k_copy = sum(1 for word in copy if word.lower() in title)
        k_detection = sum(1 for word in detection if word.lower() in title)

        print("diff_language: %s." % (k_dflanguage))
        print("copy: %s." % (k_copy))
        print("detection: %s." % (k_detection))

        # All three groups take part in the decision.
        if (k_dflanguage >= threshold or k_copy >= threshold
                or k_detection >= threshold):
            sql = "insert into resolved_papers_selected_title values (%s)" % (
                paper_id)
            print(sql)
            cur.execute(sql)
            db.commit()
            return True
        return False
    except Exception:
        # Worker boundary: report the failure instead of aborting the batch.
        db.rollback()
        print('no saved')
        return False
    finally:
        # Runs after every return as well, so the connection never leaks.
        cur.close()
        db.close()
def downloadPDF(ids):
    """Download one paper with ``urlretrieve``, direct link first.

    ``direct_link`` and then ``main_link`` are fetched in turn into
    ``<id>.pdf`` inside the ``tocheck`` folder. The first link whose response
    is announced as ``application/pdf`` ends the search, and the row in
    ``resolved_papers`` is then flagged ``downloaded = 1``. A link that cannot
    be retrieved is reported and the next one is tried.

    Args:
        ids: ``(id, main_link, direct_link)``.

    Returns:
        None. Progress is reported on standard output.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead of the source file.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
    try:
        i, main_link, direct_link = ids
        print(i)
        print(main_link)
        print(direct_link)

        # The id and links received from the caller are used as given.
        path = destination + str(i) + '.pdf'
        downloaded = "False"
        # At most two attempts: the direct link, then the main link.
        for url in (direct_link, main_link):
            if not url:
                continue
            try:
                p = urlretrieve(url, path)
            except (OSError, ValueError) as error:
                # OSError covers URLError/HTTPError; ValueError is raised for
                # an address urllib cannot interpret.
                print("Id: %s. Could not retrieve %s: %s" % (i, url, error))
                continue
            # p is (filename, headers); trust the announced content type.
            if p[1].get_content_type() == 'application/pdf':
                downloaded = "True"
                break

        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                i)
            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.MySQLError as error:
                db.rollback()
                print("Id: %s. Could not save download flag: %s" % (i, error))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        # Deliberately ignored, as before: the paper simply stays pending.
        pass
    finally:
        cur.close()
        db.close()
def _saveSectionOccurrences(id, type, section, filename, keywords):
    """Count the keywords in one text file and store every non-zero count.

    Returns True when at least one keyword occurred in the file.
    """
    path = os.path.join('/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/txt',
                        filename)
    with open(path) as infile:
        text = "".join(_processText(line) for line in infile)

    fdist = nltk.FreqDist(_processNL(text))
    found = False
    for keyword in keywords:
        keyword = str(keyword).lower()
        count = fdist[keyword]
        if count > 0:
            found = True
            sql = "insert into resolved_papers_occurrenciesv4 values (%s, '%s', '%s', '%s', %s);" % (
                id, type, section, keyword, count)
            try:
                cur.execute(sql)
                db.commit()
            except Exception as error:
                # Best effort: one failed insert must not stop the others.
                db.rollback()
                print("Id: %s. Could not save '%s' (%s): %s"
                      % (id, keyword, section, error))
    return found


def _countOccurencies(id, type):
    """Store how often each search keyword occurs in a paper's head and tail.

    Reads ``<id>_head.txt`` and ``<id>_tail_noreferences.txt`` from the text
    folder, passes every line through ``_processText`` and the joined text
    through ``_processNL``, and inserts one row into
    ``resolved_papers_occurrenciesv4`` per keyword with a non-zero frequency.

    Args:
        id: Paper identifier; also the prefix of the text file names.
        type: Value stored as-is in the second column of every inserted row.

    Returns:
        ``('Done', head, tail)`` where ``head`` / ``tail`` tell whether any
        keyword occurred in the respective section.

    Raises:
        OSError: If one of the two text files cannot be opened.
    """
    keywords = [
        "Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
        "Cross-linguistic", "Crosslinguistic", "Multi-language",
        "Multilanguage", "Multi-lingual", "Multilingual", "Multi-linguistic",
        "Multilinguistic", "Machine-translation", "Copy", "Duplicate",
        "Plagiarism", "Detection", "Discovery"
    ]
    head = _saveSectionOccurrences(id, type, "head",
                                   str(id) + '_head.txt', keywords)
    tail = _saveSectionOccurrences(id, type, "tail",
                                   str(id) + '_tail_noreferences.txt', keywords)
    return ('Done', head, tail)