def parse_collaboration_info(info):
    # Count how many times each collaborating affiliation id appears in the
    # comma-separated id string.
    collas = info.split(',')
    pub = {}
    for col in collas:
        if len(col) > 0:
            if col in pub:
                pub[col] += 1
            else:
                pub[col] = 1
    res = []
    for k in pub:
        res.append([k, pub[k]])
    # keep the six most frequent collaborators
    res = sorted(res, key=lambda x: x[1], reverse=True)
    if len(res) > 6:
        res = res[0:6]
    # resolve affiliation names and build the result dicts
    for i in range(len(res)):
        affid = res[i][0]
        try:
            Cursor.execute(
                'select name from affiliation where id = {};'.format(affid))
            affiliation_name = Cursor.fetchone()[0]
            res[i] = {
                "affiliation_id": affid,
                "affiliation_name": affiliation_name,
                "collaboration_count": res[i][1]
            }
        except Exception as e:
            traceback.print_exc()
            print(e)
    return res

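# A minimal sketch (not part of the job code) of the counting/top-N step that
# parse_collaboration_info performs by hand, using collections.Counter; the
# sample input string below is made up for illustration.
from collections import Counter

def top_collaborators_sketch(info, limit=6):
    # Count non-empty affiliation ids in the comma-separated string and keep
    # the `limit` most frequent ones as [id, count] pairs.
    counts = Counter(x for x in info.split(',') if x)
    return [[k, v] for k, v in counts.most_common(limit)]

# e.g. top_collaborators_sketch("7,7,12,7,3,12") -> [['7', 3], ['12', 2], ['3', 1]]
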
def _load_affiliation_ids(self):
    sql = '''SELECT id FROM affiliation'''
    Cursor.execute(sql)
    raw_result = list(Cursor.fetchall())
    self.affiliation_ids = [i[0] for i in raw_result]
    print("{} affiliation_total_count: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(self.affiliation_ids)))

def update_author_collaboration_job():
    sql = 'SELECT id FROM author'
    Cursor.execute(sql)
    author_list = list(map(lambda x: x[0], list(Cursor.fetchall())))
    update_sql = '''
        INSERT INTO author_collaboration(start_id,end_id,distance,predict_collaboration)
        VALUES (%s,%s,%s,%s)
    '''
    wfile = open(
        "/Users/Karl/Desktop/SoftwareExercise/authorCollaboration.txt",
        "a+", encoding="utf-8")
    for authors in chunks(author_list, 500):
        start_time = time.time()
        author_collaboration_list = []  # reset per batch so earlier rows are not re-inserted
        for author in authors:
            with Neo4jDriver.session() as session:
                res = session.read_transaction(searchCoAuthor, author)
                # data = []
                # for record in res:
                #     data.append(record["authorId"])
                # author_collaboration_dict[author] = data
                # wfile.write(json.dumps(author_collaboration_dict, indent=4))
                # author_collaboration_dict.clear()
                # end_time = time.time()
                # duration = end_time - start_time
                # print('update_author_collaboration_job runtime is:{0:.3f}s'.format(duration))
                # wfile.close()
                for coAuthor in res:
                    jaccrdDistance = computeJaccrdDistance(
                        author, coAuthor["authorId"])
                    print((author, coAuthor["authorId"],
                           round(jaccrdDistance[0], 2), json.dumps(jaccrdDistance[1])))
                    author_collaboration_list.append(
                        (author, coAuthor["authorId"],
                         round(jaccrdDistance[0], 2), json.dumps(jaccrdDistance[1])))
        try:
            Cursor.executemany(update_sql, author_collaboration_list)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        end_time = time.time()
        duration = end_time - start_time
        print('update_author_collaboration_job 500 runtime is:{0:.3f}s'.format(
            duration))
        time.sleep(1)

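# `chunks` is used by every job in this module but is not defined in this
# section. A minimal sketch of what it is assumed to do (split a list into
# fixed-size batches); named differently here so it cannot clash with the real helper.
def chunks_sketch(lst, n):
    # Yield successive slices of at most n items from lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# e.g. list(chunks_sketch([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
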
def update_one_affiliation_year_count(id, pipe):
    try:
        Cursor.execute('''
            select group_concat(concat(year(art.date)) separator ",")
            from article art, affiliation_article afar
            where afar.article_id = art.id and afar.affiliation_id = ''' + id + ";")
        raw_result = Cursor.fetchone()[0]
        if raw_result:
            raw_result = raw_result.strip()
            res = json.dumps(parseInfo(raw_result))
            pipe.set(cache_const.AFFILIATION_YEAR_COUNT.format(id), res)
    except Exception as e:
        traceback.print_exc()
        print(e)

def update_one_affiliation_collaboration(id, pipe):
    sql = '''
        select group_concat(aff2.affiliation_id)
        from affiliation_article aff1, affiliation_article aff2
        where aff1.affiliation_id <> aff2.affiliation_id
          and aff1.article_id = aff2.article_id
          and aff1.affiliation_id={}
    '''.format(id)
    try:
        Cursor.execute(sql)
        inf = Cursor.fetchone()[0]
        if inf and len(inf) > 0:
            res = parse_collaboration_info(inf)
            pipe.set(AFFILIATION_COLLABORATION_PUBLICATION_COUNT.format(id),
                     json.dumps(res))
    except Exception as e:
        traceback.print_exc()
        print(e)

def update_affiliation_keyword_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0], reverse=False)
    related_keyword_dict = {}
    sql = '''SELECT keyword_id,keyword_desc,COUNT(article_id) AS num
             FROM keyword_article WHERE article_id IN %s
             GROUP BY keyword_id,keyword_desc ORDER BY num DESC'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]
            # the affiliation has no associated articles
            if not articles or len(articles) == 0:
                continue
            Cursor.execute(sql, (articles, ))
            raw_result = list(Cursor.fetchall())
            if not raw_result:
                continue
            keywords = list(map(parseKeyword, raw_result))
            related_dict[affiliation_id] = keywords
        related_keyword_dict.update(related_dict)
    print("{} related_keyword_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_keyword_dict)))
    pipeline = RedisTemplate.pipeline()
    for articles in chunks(related_article_list, 500):
        for article in articles:
            article_key = AFFILIATION_RELATED_KEYWORD_KEY_TEMPLATE.format(
                article[0])
            keywords = related_keyword_dict.get(article[0])
            if keywords:
                pipeline.set(article_key, json.dumps(keywords))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_keyword_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

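# `parseKeyword` is not defined in this section. Based on the SELECT above
# (keyword_id, keyword_desc, COUNT(article_id) AS num), it presumably turns
# each row into a small dict; a sketch under that assumption, with field names
# chosen here for illustration only.
def parse_keyword_sketch(row):
    keyword_id, keyword_desc, num = row
    return {"keyword_id": keyword_id,
            "keyword_desc": keyword_desc,
            "article_count": num}
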
def update_affiliation_new_article_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0], reverse=False)
    related_new_article_dict = {}
    sql = '''SELECT id FROM article WHERE id IN %s ORDER BY date DESC LIMIT 1'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]
            # the affiliation has no associated articles
            if not articles or len(articles) == 0:
                continue
            Cursor.execute(sql, (articles, ))
            raw_result = Cursor.fetchone()  # check for None before indexing
            if raw_result is None:
                continue
            article_id = raw_result[0]
            related_dict[affiliation_id] = article_id
        related_new_article_dict.update(related_dict)
    print("{} related_new_article_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_new_article_dict)))
    pipeline = RedisTemplate.pipeline()
    for articles in chunks(related_article_list, 500):
        for article in articles:
            article_key = AFFILIATION_RELATED_NEW_ARTICLE_ID_KEY_TEMPLATE.format(
                article[0])
            new_article_id = related_new_article_dict.get(article[0])
            if new_article_id:
                pipeline.set(article_key, json.dumps(new_article_id))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_new_article_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

def _load_related_article_dict(self):
    for ids in chunks(self.affiliation_ids, 500):
        sql = '''
            SELECT affiliation_id, group_concat(article_id) as article_ids
            FROM affiliation_article
            WHERE affiliation_id IN %s
            GROUP BY affiliation_id
        '''
        Cursor.execute(sql, (ids, ))
        raw_result = list(Cursor.fetchall())
        related_dict = {}
        for info in raw_result:
            if info is None:
                continue
            if info[1] is None or len(info[1]) == 0:
                # no articles concatenated for this affiliation
                self.related_article_dict[info[0]] = []
            else:
                related_dict[info[0]] = info[1].split(',')
        self.related_article_dict.update(related_dict)
        time.sleep(1)
    print("{} related_article_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(self.related_article_dict)))

def computeAffiliationDistance(start, end):
    intersection_sql = '''
        SELECT COUNT(DISTINCT a1.affiliation_id)
        FROM acmieee.affiliation_author a1,acmieee.affiliation_author a2
        WHERE a1.author_id = %s AND a2.author_id = %s
          AND a1.affiliation_id = a2.affiliation_id
    '''
    union_sql = '''
        SELECT COUNT(DISTINCT affiliation_id)
        FROM acmieee.affiliation_author
        WHERE author_id = %s OR author_id = %s
    '''
    Cursor.execute(intersection_sql, (str(start), str(end)))
    intersection_num = Cursor.fetchone()[0]
    Cursor.execute(union_sql, (str(start), str(end)))
    union_num = Cursor.fetchone()[0]
    if union_num == 0:
        return 1
    return 1 - intersection_num / union_num

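# The two queries above compute a Jaccard distance over the affiliations shared
# by two authors. A minimal in-memory sketch of the same formula on plain sets,
# for reference only (the job itself works directly against the database):
def jaccard_distance_sketch(a, b):
    # 1 - |A ∩ B| / |A ∪ B|; by convention the distance is 1 when both sets
    # are empty, matching the union_num == 0 branch above.
    union = a | b
    if not union:
        return 1
    return 1 - len(a & b) / len(union)

# e.g. jaccard_distance_sketch({1, 2, 3}, {2, 3, 4}) -> 0.5
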
def computeDirectionDistance(start, end):
    intersection_sql = '''
        SELECT group_concat(DISTINCT a1.keyword_desc separator "\t")
        FROM acmieee.keyword_author a1,acmieee.keyword_author a2
        WHERE a1.author_id = %s AND a2.author_id = %s
          AND a1.keyword_id = a2.keyword_id
    '''
    union_sql = '''
        SELECT COUNT(DISTINCT keyword_id)
        FROM acmieee.keyword_author
        WHERE author_id = %s OR author_id = %s
    '''
    Cursor.execute(intersection_sql, (str(start), str(end)))
    intersection = Cursor.fetchone()[0]
    intersection_num = 0
    predictDirections = []
    if intersection:
        predictDirections = intersection.split("\t")
        intersection_num = len(predictDirections)
    Cursor.execute(union_sql, (str(start), str(end)))
    union_num = Cursor.fetchone()[0]
    if union_num == 0:
        # keep the same [distance, directions] shape as the normal return path
        return [1, predictDirections]
    return [1 - intersection_num / union_num, predictDirections]

def update_affiliation_database_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0], reverse=False)
    affiliation_info_list = []
    sql = '''
        SELECT aff.name,AVG(art.citation_count),SUM(art.citation_count),
               COUNT(art.id),MIN(YEAR(art.date)),MAX(YEAR(art.date)),
               COUNT(art.pdf_link),AVG(art.total_usage-art.citation_count)
        FROM article art,affiliation aff
        WHERE art.id IN %s AND aff.id = %s
    '''
    back_up_sql = '''
        SELECT aff.name FROM affiliation aff WHERE aff.id = %s
    '''
    update_sql = '''
        INSERT INTO affiliation_info
            (affiliation_id,affiliation_name,average_citation_per_article,
             citation_count,publication_count,start_year,end_year,
             available_download,average_download_per_article,
             create_time,update_time)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
            affiliation_name = VALUES (affiliation_name),
            average_citation_per_article = VALUES (average_citation_per_article),
            citation_count = VALUES (citation_count),
            publication_count = VALUES (publication_count),
            start_year = VALUES (start_year),
            end_year = VALUES (end_year),
            available_download = VALUES (available_download),
            average_download_per_article = VALUES (average_download_per_article),
            update_time = VALUES (update_time)
    '''
    for affiliations_articles in chunks(related_article_list, 500):
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]
            update_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if not articles or len(articles) == 0:
                # no articles: fall back to the name-only query and write zero stats
                Cursor.execute(back_up_sql, (affiliation_id, ))
                raw_result = Cursor.fetchone()
                if raw_result is None:
                    continue
                affiliation_name = raw_result[0]
                affiliation_info_list.append(
                    (affiliation_id, affiliation_name, 0.0, 0, 0, -1, -1, 0, 0.0,
                     update_time, update_time))
                continue
            Cursor.execute(sql, (articles, affiliation_id, ))
            raw_result = Cursor.fetchone()  # check for None before indexing
            if raw_result is None:
                continue
            affiliation_name = raw_result[0]
            average_citation_per_article = float(
                str(raw_result[1].quantize(Decimal('0.00'))))
            citation_count = int(str(raw_result[2]))
            publication_count = raw_result[3]
            start_year = raw_result[4]
            end_year = raw_result[5]
            available_download = raw_result[6]
            average_download_per_article = float(
                str(raw_result[7].quantize(Decimal('0.00'))))
            affiliation_info_list.append(
                (affiliation_id, affiliation_name, average_citation_per_article,
                 citation_count, publication_count, start_year, end_year,
                 available_download, average_download_per_article,
                 update_time, update_time))
    print("{} affiliation_info_list_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(affiliation_info_list)))
    for affiliation_infos in chunks(affiliation_info_list, 500):
        try:
            Cursor.executemany(update_sql, affiliation_infos)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        time.sleep(1)
    print("{} update_affiliation_database_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))