def save(self, gr_id, gr_info): # Generate pseudo-key-id __g_id = "g_" + str(gr_id) # Check if group exists at db if len(self.rd_instance_us.keys(__g_id)) == 0: # Save group and mark to active gr_info["state"] = "active" self.rd_instance_us.hmset("g_" + str(gr_id), gr_info) # Print alert if config.DEBUGGER: config.print_message("- Added Group: %d" % int(gr_id)) # Group exists at redis else: # Get info at redis gr_rd = self.rd_instance_us.hgetall(__g_id) # Detect different information from two groups __new_group = st_diff.groups(gr_info, gr_rd) if __new_group is not None: # Generate new group self.rd_instance_us.hmset(__g_id, gr_info) # Print alert if config.DEBUGGER: config.print_message("- Updated Group: %d" % int(gr_id))
def save(self, us_id, us_info):
    """Insert or refresh a user hash in redis under the key ``u_<us_id>``."""
    redis_key = "u_" + str(us_id)
    if len(self.rd_instance_us.keys(redis_key)) == 0:
        # Unknown user: persist it flagged as active.
        us_info["state"] = "active"
        self.rd_instance_us.hmset("u_" + str(us_id), us_info)
        if config.DEBUGGER:
            config.print_message("- Added User: %d" % int(us_id))
        return
    # Known user: rewrite the stored hash only when a difference is detected.
    stored_info = self.rd_instance_us.hgetall(redis_key)
    if st_diff.users(us_info, stored_info) is not None:
        self.rd_instance_us.hmset(redis_key, us_info)
        if config.DEBUGGER:
            config.print_message("- Updated User: %d" % int(us_id))
def update_information(self, update):
    """Synchronise one entity family ("users", "groups" or "projects") from gitlab into redis."""
    gl_items = sniff.get_keys_and_values_from_gitlab(self, update)
    rd_ids = sniff.get_keys_from_redis(self, update)
    gl_ids = gl_items.keys()
    # Ids present on gitlab are new or possibly changed; ids only in redis are gone.
    diff_ids = set(gl_ids).difference(set(rd_ids))
    common_ids = set(gl_ids).intersection(set(rd_ids))
    to_save = list(diff_ids.union(common_ids))
    to_drop = list(set(rd_ids).difference(set(gl_ids)))
    if config.DEBUGGER:
        config.print_message("- [ %s ] New or possible updates: %d | Deleted: %d" %
                             (update, len(to_save), len(to_drop)))
    # Dispatch tables replace the repeated if/elif chains.
    savers = {
        "users": util_user.save,
        "groups": util_group.save,
        "projects": util_project.save,
    }
    deleters = {
        "users": util_user.delete,
        "groups": util_group.delete,
        "projects": util_project.delete,
    }
    # Insert / modify information.
    for key in to_save:
        if update in savers:
            savers[update](self, key, gl_items[key])
    # Delete information.
    for key in to_drop:
        if update in deleters:
            deleters[update](self, key)
def generate_pycode():
    """Copy the rendered template into the generated glapi module.

    Reads ``generated/template.tmp`` and writes its content verbatim to
    ``generated/glapi.py``.
    """
    settings.print_message(" - Generating Python code ... ")
    # Context managers guarantee both handles are closed even if an
    # exception is raised mid-way (the original leaked on error).
    with open("generated/template.tmp", "r") as source:
        content = source.read()
    with open("generated/glapi.py", "w") as target:
        target.write(content)
def generate_doc(branch):
    """Regenerate the gitlab docs for *branch*, wiping any previous output first."""
    file_path = settings.GEN_DOC_DISK_PATH
    if os.path.exists(file_path):
        # Remove whatever occupies the output path: a directory tree
        # recursively (errors ignored), a plain file directly.
        if os.path.isdir(file_path):
            shutil.rmtree(file_path, True)
        else:
            os.remove(file_path)
    settings.print_message(" - Generating branch: %s." % branch)
    call(["./gitlab-docs/generate.rb"])
def generate_pypi_settings():
    """Render ``~/.pypirc`` from the pypi template, filling in the configured credentials.

    Reads ``generated/pypi.tmp``, substitutes the PYPI_USERNAME /
    PYPI_PASSWORD placeholders with ``settings.PYPI_USER`` /
    ``settings.PYPI_PASS``, and writes the result to the user's home.
    """
    settings.print_message(" - Generating pypi config ... ")
    # Context managers close the handles even on error (the original leaked).
    with open("generated/pypi.tmp", "r") as template:
        content = template.read()
    content = content.replace("PYPI_USERNAME", settings.PYPI_USER)
    content = content.replace("PYPI_PASSWORD", settings.PYPI_PASS)
    with open(os.path.join(os.path.expanduser("~"), ".pypirc"), "w") as target:
        target.write(content)
def generate_settings(version):
    """Render ``generated/settings.py`` from its template, stamping the API version.

    *version* may use ``-`` as a separator; it is normalised to dots and
    suffixed with ``.8`` before substitution into API_VERSION_TEMPLATE.
    """
    normalized = str(version).replace("-", ".")
    settings.print_message(" - Generating settings.py: %s ... " % normalized)
    # Context managers close the handles even on error (the original leaked).
    with open("generated/settings.tmp", "r") as template:
        content = template.read()
    content = content.replace("API_VERSION_TEMPLATE", normalized + ".8")
    with open("generated/settings.py", "w") as target:
        target.write(content)
def _search_scholar_soup(soup, max_papers_count, total_papers, start_paper, skip_endnote=False, print_level=0):
    """Generator that returns pub information dictionaries from the search page.

    Walks Google Scholar result pages starting from *soup*, yielding one
    parsed info dict per result block until *max_papers_count* papers have
    been produced or no "next page" link remains.
    NOTE(review): *start_paper* is accepted but never used in this body —
    confirm whether it should offset the iteration.
    """
    page_num = 1
    counter = 0  # papers yielded so far, across all pages
    while True:
        # Result blocks carry both the "gs_r" and "gs_or" CSS classes.
        paper_blocks = soup.find_all('div', class_=lambda css_class: \
            ("gs_r" in css_class and "gs_or" in css_class) if css_class else False)
        page_total = len(paper_blocks)
        logger.debug(
            "Find papers on page #{0} (google_max_papers = {1})".format(
                page_num, max_papers_count))
        logger.debug("Total %i papers on page." % (page_total))
        for page_counter, paper in enumerate(paper_blocks):
            # Stop yielding once the global cap is reached.
            if counter >= max_papers_count:
                break
            counter += 1
            if print_level >= 0:
                settings.print_message(
                    "Process paper #{} (total {})".format(
                        counter, total_papers), print_level)
            logger.debug("Process paper #{} (total {})".format(
                counter, total_papers))
            logger.debug(
                "Parse html and get info about paper #{0} on searching page (total {1} on page)"
                .format(page_counter + 1, page_total))
            yield _get_info_from_resulting_selection(paper, skip_endnote, print_level)
        # Follow the "next page" arrow only while more papers are wanted.
        if soup.find(class_='gs_ico gs_ico_nav_next'
                     ) and counter < max_papers_count:
            url = soup.find(
                class_='gs_ico gs_ico_nav_next').parent['href'].strip()
            result = True
            soup = None
            logger.debug("Load next page in resulting query selection.")
            # Single fetch attempt: a failed load clears `result` and ends the loop.
            while result and soup is None:
                soup = utils.get_soup(_FULLURL.format(_HOST, url))
                if soup is None:
                    result = None
                    # while result is None:
                    #     result = input('Do not load new page on scholar. Try again? [Y/N]').lower()
                    #     if result == "y": result = True
                    #     elif result == "n": result = False
            if soup is None:
                logger.debug(
                    "Soup from google.scholar is None. Break from paper generator loop."
                )
                break
            page_num += 1
        else:
            break
def save(self, pr_id, pr_info):
    """Persist a gitlab project in redis under ``p_<pr_id>``, refreshing its repo on disk first."""
    # Mirror/refresh the git repository on the filesystem.
    save_fs(pr_info)
    redis_key = "p_" + str(pr_id)
    # Normalise the owner field to a pseudo-key: group namespace or user id.
    if pr_info.get("owner") is None:
        pr_info["owner"] = "g_" + str(pr_info.get("namespace").get("id"))
    else:
        pr_info["owner"] = "u_" + str(pr_info.get("owner").get("id"))
    del pr_info["namespace"]
    # Flatten the repository tags from the Gitlab API into ascii names.
    pr_info['tags'] = [
        tag.get("name").encode("ascii", "ignore")
        for tag in self.gl_instance.get_projects_repository_tags_byId(id=pr_id)
    ]
    # The string archived flag becomes a state value.
    pr_info['state'] = 'archived' if pr_info['archived'] == 'true' else 'active'
    del pr_info['archived']
    if len(self.rd_instance_pr.keys(redis_key)) == 0:
        # New project: store it as-is.
        self.rd_instance_pr.hmset(redis_key, pr_info)
        if config.DEBUGGER:
            config.print_message("- Added Project: %d" % int(pr_id))
    else:
        # Existing project: rewrite only when a difference is detected.
        stored_info = self.rd_instance_pr.hgetall(redis_key)
        if st_diff.projects(pr_info, stored_info) is not None:
            self.rd_instance_pr.hmset(redis_key, pr_info)
            if config.DEBUGGER:
                config.print_message("- Updated Project: %d" % int(pr_id))
    # Branches / commits / metadata may have changed regardless of the hash.
    save_code(self, pr_id, pr_info.get("name"))
def save(self, pr_id, pr_info):
    """Persist a gitlab project in redis under ``p_<pr_id>``, refreshing its repo on disk first.

    NOTE(review): this is a near-verbatim duplicate of the other project
    ``save`` in this file — consider unifying them.
    """
    # Save project at fs if it is necessary
    save_fs(pr_info)
    # Generate pseudo-key-id
    __p_id = "p_" + str(pr_id)
    # Get project's owner from metadata: a group namespace or a user id.
    if pr_info.get("owner") is None:
        pr_info["owner"] = "g_" + str(pr_info.get("namespace").get("id"))
    else:
        pr_info["owner"] = "u_" + str(pr_info.get("owner").get("id"))
    del pr_info["namespace"]
    # Get project's tags from Gitlab API, reduced to ascii tag names.
    pr_info['tags'] = map(
        lambda x: x.get("name").encode("ascii", "ignore"),
        self.gl_instance.get_projects_repository_tags_byId(id=pr_id))
    # Generate state from the string archived flag.
    pr_info[
        'state'] = 'archived' if pr_info['archived'] == 'true' else 'active'
    del pr_info['archived']
    # Check if project exists at db
    if len(self.rd_instance_pr.keys(__p_id)) == 0:
        # Save project
        self.rd_instance_pr.hmset(__p_id, pr_info)
        # Print alert
        if config.DEBUGGER:
            config.print_message("- Added Project: %d" % int(pr_id))
    # Project exists at redis
    else:
        # Get info at redis
        pr_rd = self.rd_instance_pr.hgetall(__p_id)
        # Detect different information from two projects
        __new_project = st_diff.projects(pr_info, pr_rd)
        if __new_project is not None:
            # Generate new project
            self.rd_instance_pr.hmset(__p_id, pr_info)
            # Print alert
            if config.DEBUGGER:
                config.print_message("- Updated Project: %d" % int(pr_id))
    # Project has changes at branches, commits, metadata ...
    save_code(self, pr_id, pr_info.get("name"))
def generate_meta_code(file_dir):
    """Build a metadata dict from every html doc file in *file_dir*.

    Returns a mapping of generated code keys to their metadata. Duplicate
    keys are reported via ``settings.print_message`` and the first
    occurrence wins.
    """
    md = {}
    settings.print_message(" - Generating metadata from html docs ... ")
    for entry in os.listdir(file_dir):
        gen_code = generate_code_from_file(entry, file_dir + "/" + entry)
        for key in gen_code:
            # Membership test on the dict itself — no keys() indirection.
            if key in md:
                settings.print_message(" * Duplicated at [" + entry + "]: " +
                                       md[key].get("string"))
            else:
                md[key] = gen_code[key]
    return md
def get_pdf(rg_paper_id, filename):
    """Load pdf for paper with rg_paper_id and save to file filename.

    Returns the download result, or False when no url is found or the
    download raises. (The trailing ``return True`` of the original was
    unreachable and has been removed.)
    """
    url = get_pdf_url(rg_paper_id)
    if url is None:
        return False
    try:
        settings.print_message("\tDownload pdf...")
        return utils.download_file(url, filename)
    except BaseException:
        # Deliberately broad: any failure simply means "no pdf".
        logger.warn(traceback.format_exc())
        return False
def processFullDocument(pdf_file_name):
    """Get the full-text data for a PDF via the grobid service.

    Returns the raw grobid response, or None when the server returned an
    empty response.
    """
    settings.print_message("Send to grobid service.", 2)
    # `with` closes the pdf handle deterministically (the original leaked it).
    with open(pdf_file_name, 'rb') as pdf_file:
        data = get_data_from_grobid(GROBID_PROCESSED_FULL_TEXT_COMMAND,
                                    pdf_file)
    settings.print_message("Check data.", 2)
    logger.debug("Check data.")
    if not data:
        logger.debug(
            "Server returned empty response (File processing failed), skip.")
        return None
    logger.debug("Convert completed!")
    return data
def delete(self, us_id):
    """Remove the redis hash for user *us_id* if it is present."""
    redis_key = "u_" + str(us_id)
    if len(self.rd_instance_us.keys(redis_key)) > 0:
        # Drop the stored hash outright.
        self.rd_instance_us.delete(redis_key)
        if config.DEBUGGER:
            config.print_message("- Removed User %d" % int(us_id))
def get_pdf(DOI, filename):
    """Load pdf for paper with DOI and save to file filename.

    Returns whether the downloaded file is a valid pdf, or False when no
    url is found. Download/validation errors are logged and re-raised.
    (The trailing ``return True`` and commented-out ``return False`` of the
    original were dead code and have been removed.)
    """
    url = get_pdf_url(DOI)
    if url is None:
        return False
    try:
        settings.print_message("Download pdf...", 2)
        utils.download_file(url, filename)
        return utils.check_pdf(filename)
    except BaseException:
        logger.warn(traceback.format_exc())
        raise
def delete(self, gr_id):
    """Remove the redis hash for group *gr_id* if it is present."""
    redis_key = "g_" + str(gr_id)
    if len(self.rd_instance_us.keys(redis_key)) > 0:
        # Drop the stored hash outright.
        self.rd_instance_us.delete(redis_key)
        if config.DEBUGGER:
            config.print_message("- Removed Group: %d" % int(gr_id))
def save_fs(pr_info):
    """Mirror-clone (or refresh) the project's git repository on local disk."""
    # Make sure the root folder for all repositories exists.
    if not os.path.exists(config.COLLECTOR_GIT_FOLDER):
        os.makedirs(config.COLLECTOR_GIT_FOLDER)
    previous_dir = os.getcwd()
    # Pseudo-name-id of the repo folder and the remote url.
    repo_name = str(pr_info.get("id")) + "_" + pr_info.get("name")
    repo_url = pr_info.get("http_url_to_repo")
    # Inject the gitlab credentials right after the url scheme.
    scheme = "https://" if str(repo_url).startswith("https://") else "http://"
    repo_url = str(repo_url).replace(
        scheme, scheme + config.GITLAB_USER + ":" + config.GITLAB_PASS + "@")
    os.chdir(config.COLLECTOR_GIT_FOLDER)
    # NOTE(review): the git command lines are built by string concatenation —
    # unsafe if project names or credentials ever contain shell metacharacters.
    if os.path.exists(repo_name):
        # Already cloned: refresh it in place.
        os.chdir(config.COLLECTOR_GIT_FOLDER + repo_name)
        commands.getstatusoutput("git pull " + repo_url)
        if config.DEBUGGER:
            config.print_message("- Pulled Project: " + pr_info.get("name"))
    else:
        # First time: bare mirror clone.
        commands.getstatusoutput("git clone --mirror " + repo_url + " " + repo_name)
        if config.DEBUGGER:
            config.print_message("- Cloned Project: " + pr_info.get("name"))
    # Revert to the caller's working directory.
    os.chdir(previous_dir)
def save_fs(pr_info):
    """Mirror-clone (or refresh) the project's git repository on local disk.

    NOTE(review): this is a near-verbatim duplicate of the other ``save_fs``
    in this file — consider unifying them. The git command lines are built by
    string concatenation, which is unsafe if names or credentials contain
    shell metacharacters.
    """
    # Create folder to allocate all repositories if it does not exist
    if not os.path.exists(config.COLLECTOR_GIT_FOLDER):
        os.makedirs(config.COLLECTOR_GIT_FOLDER)
    # Save (temp) current directory
    cur_dir = os.getcwd()
    # Generate pseudo-name-id and get url
    __pr_id = str(pr_info.get("id")) + "_" + pr_info.get("name")
    __pr_url = pr_info.get("http_url_to_repo")
    # Insert credentials right after the HTTP/S scheme
    __replace = "http://"
    if str(__pr_url).startswith("https://"):
        __replace = "https://"
    __pr_url = str(__pr_url).replace(
        __replace,
        __replace + config.GITLAB_USER + ":" + config.GITLAB_PASS + "@"
    )
    # Change current directory to folder
    os.chdir(config.COLLECTOR_GIT_FOLDER)
    # Check repository does not exist
    if not os.path.exists(__pr_id):
        # Clone (mirror like bare repository)
        commands.getstatusoutput("git clone --mirror " + __pr_url + " " + __pr_id)
        # Print alert
        if config.DEBUGGER:
            config.print_message("- Cloned Project: " + pr_info.get("name"))
    # Repository exists
    else:
        # Change current directory to repository and refresh it in place
        os.chdir(config.COLLECTOR_GIT_FOLDER + __pr_id)
        commands.getstatusoutput("git pull " + __pr_url)
        # Print alert
        if config.DEBUGGER:
            config.print_message("- Pulled Project: " + pr_info.get("name"))
    # Revert current directory
    os.chdir(cur_dir)
def get_request(url):
    """Send get request & return data.

    Returns the raw response body, or None on a non-200 status or when the
    user declines to retry after a transport error. (Removed the unused
    ``resp = None`` initialiser and the unreachable trailing return.)
    """
    while True:
        try:
            resp = _SESSION.get(url)
            if resp.status_code != 200:
                settings.print_message("HTTP Error #{0}. {1}.".format(
                    resp.status_code, resp.reason))
                return None
            return resp.content
        except Exception as error:
            logger.warn(traceback.format_exc())
            settings.print_message(error)
            # Interactive retry: keep looping only on explicit consent.
            if input("Try load again? [y/n]: ") == 'y':
                continue
            return None
def get_pdf(url, filename):
    """Download the google-scholar pdf at *url* into *filename*.

    Returns whether the saved file is a valid pdf, or None when no url is
    given. Download/validation errors are logged and re-raised;
    KeyboardInterrupt propagates unlogged. (The trailing ``return 0`` of
    the original was unreachable and has been removed.)
    """
    settings.print_message("PDF-file found in google scholar.", 2)
    if url is None:
        return None
    try:
        settings.print_message("Download pdf...", 2)
        utils.download_file(url, filename)
        return utils.check_pdf(filename)
    except KeyboardInterrupt:
        raise
    except BaseException:
        logger.warn(traceback.format_exc())
        raise
def get_pdf(QUESTION, filename):
    """Download from Sci-Hub the pdf found for *QUESTION* into *filename*.

    Returns whether the saved file is a valid pdf, or None when *QUESTION*
    is empty or no url is found. Download/validation errors are logged and
    re-raised; KeyboardInterrupt propagates unlogged. (The trailing
    ``return 0`` of the original was unreachable and has been removed.)
    """
    if not QUESTION:
        return None
    url = get_pdf_url(QUESTION)
    if url is None:
        return None
    try:
        settings.print_message(
            "Download pdf from Sci-Hub by '{}'".format(QUESTION), 2)
        utils.download_file(url, filename)
        return utils.check_pdf(filename)
    except KeyboardInterrupt:
        raise
    except BaseException:
        logger.warn(traceback.format_exc())
        raise
def delete(self, pr_id):
    """Mark project *pr_id* as deleted in redis and archive its folder on disk."""
    redis_key = "p_" + str(pr_id)
    if len(self.rd_instance_pr.keys(redis_key)) > 0:
        project_info = self.rd_instance_pr.hgetall(redis_key)
        # Relocate the repository folder before flagging the record.
        delete_fs(project_info)
        # Soft delete: the hash is kept, only its state flips.
        self.rd_instance_pr.hset(redis_key, "state", "deleted")
        if config.DEBUGGER:
            config.print_message("- Removed Project %d " % int(pr_id))
def processReferencesDocument(pdf_file_name):
    """Get references from article PDF via the grobid service.

    Returns the parsed reference list, or None when the server response is
    empty or the TEI result carries no references.
    """
    settings.print_message("Send to grobid service..", 2)
    # `with` closes the pdf handle deterministically (the original leaked it).
    with open(pdf_file_name, 'rb') as pdf_file:
        data = get_data_from_grobid(GROBID_PROCESSED_REFERENCES_COMMAND,
                                    pdf_file)
    settings.print_message("Check data", 2)
    logger.debug("Check data")
    if not data:
        logger.debug(
            "Server returned empty response (File processing failed), skip.")
        return None
    settings.print_message("Processing TEI data", 2)
    logger.debug("Convert tei to dictionary")
    dictData = tei2dict.tei_to_dict(data)
    logger.debug("Convert completed: {}".format(json.dumps(dictData)))
    if not dictData["references"]:
        logger.debug("References are not available, skip")
        return None
    return dictData["references"]
def processHeaderDocument(pdf_file_name):
    """Get info from header PDF via the grobid service.

    Returns the parsed header dictionary (with ``abstract_ru`` forced to
    None), or None when the server response is empty.
    """
    settings.print_message("Send to grobid service.", 2)
    # `with` closes the pdf handle deterministically (the original leaked it).
    with open(pdf_file_name, 'rb') as pdf_file:
        data = get_data_from_grobid(GROBID_PROCESSED_HEADER_COMMAND, pdf_file)
    settings.print_message("Check data.", 2)
    logger.debug("Check data.")
    if not data:
        logger.debug(
            "Server returned empty response (File processing failed), skip.")
        return None
    settings.print_message("Processing TEI data.", 2)
    logger.debug("Convert tei to dictionary.")
    dictData = tei2dict.tei_to_dict(data)
    logger.debug("Convert completed: {}".format(json.dumps(dictData)))
    # Only the author count feeds the summary line below.
    authors = set(dictData["authors"]) if dictData["authors"] else []
    msg = "RESULT: has title:{:^3}has date:{:^3}has DOI:{:^3}has abstract:{:^3}authors:{:^4}has start page:{:^3}has end page:{:^3}has publisher:{:^3}".format(
        dictData["title"] is not None, dictData["pubdate"] is not None,
        dictData["DOI"] is not None, dictData["abstract"] is not None,
        len(authors), dictData["start_page"] is not None,
        dictData["end_page"] is not None, dictData["publisher"] is not None)
    dictData["abstract_ru"] = None
    logger.debug(msg)
    return dictData
def save_code(self, pr_id, pr_name):
    """Synchronise the branches (and their commit links) of project *pr_id*
    between gitlab and redis: delete branches gone from gitlab, then
    save/refresh the rest and update their commits.
    """
    # Generate pseudo-key-id
    __p_id = "p_" + str(pr_id)
    # Generate metadata from gitlab, keyed by branch name
    __branches_gl_info = {}
    __branches = self.gl_instance.get_projects_repository_branches_byId(id=pr_id)
    [__branches_gl_info.update({
        x.get("name"): x
    })for x in __branches]
    # Generate metadata from redis; branch names are b16-encoded in the key
    __branches_rd_info = {}
    __branches = self.rd_instance_br.keys(__p_id + ":*")
    [__branches_rd_info.update({
        base64.b16decode(x.split(":")[1]): self.rd_instance_br.hgetall(x)
    }) for x in __branches]
    # Generate difference and intersection metadata:
    # __mt_mod = everything currently on gitlab, __mt_del = only-in-redis.
    __mt_diff = set(__branches_gl_info.keys()).difference(set(__branches_rd_info.keys()))
    __mt_int = set(__branches_gl_info.keys()).intersection(set(__branches_rd_info.keys()))
    __mt_mod = list(__mt_diff.union(__mt_int))
    __mt_del = list(set(__branches_rd_info.keys()).difference(set(__branches_gl_info.keys())))
    # Structure for removed commits (candidates for full deletion)
    __mt_del_commits = set()
    # Delete information about Branch
    count = 0
    for i in __mt_del:
        # Number of deleted branches
        count += 1
        # Print alert
        if config.DEBUGGER:
            config.print_message(
                "* (%d) [%d/%d] Deleted %s" %(int(pr_id), count, len(__mt_del), i)
            )
        # Get information from redis
        __br_info = __branches_rd_info[i]
        # Generate pseudo-key-id and remove info
        __br_id = __p_id + ":" + __br_info.get("id")
        self.rd_instance_br.delete(__br_id)
        self.rd_instance_br_co.delete(__br_id)
        # Remove links with contributors.
        # NOTE(review): eval() on redis-stored data — unsafe if the stored
        # contributor list is ever attacker-controlled; confirm trust model.
        __br_con = eval(__br_info.get("contributors"))
        for j in __br_con:
            __us_com = self.rd_instance_us_co.smembers(j)
            for x in __us_com:
                if str(x).startswith(__br_id):
                    # Remember project:commit so it can be purged if orphaned.
                    __mt_del_commits.add(str(x).split(":")[0] + ":" + str(x).split(":")[2])
                    self.rd_instance_us_co.srem(j, x)
    # Remove all unique commits no longer referenced by any remaining branch
    if len(__mt_del_commits) > 0:
        __rd_branch_co = set()
        __rd_branch = self.rd_instance_br.keys(__p_id + "*")
        for i in __rd_branch:
            __rd_branch_co = __rd_branch_co.union(
                set(dict(self.rd_instance_br_co.zrange(i, 0, -1)).keys())
            )
        for i in __mt_del_commits:
            if i not in __rd_branch_co:
                self.rd_instance_co.delete(i)
    # Update information about Branch
    count = 0
    for i in __mt_mod:
        # Number of reviewed branches
        count += 1
        # Print alert
        if config.DEBUGGER:
            config.print_message(
                "* (%d) [%d/%d] Reviewed %s" %(int(pr_id), count, len(__mt_mod), i)
            )
        # Clean information
        __br_info = __branches_gl_info[i]
        st_clean.branch(__br_info)
        # Generate pseudo-key-id
        __br_id = __p_id + ":" + __br_info.get("id")
        # Save / Replace information at redis
        self.rd_instance_br.hmset(__br_id, __br_info)
        # Update information about branch's commits
        util_commit.update(self, pr_id, pr_name, i)
def get_friends_graph():
    """ Get friends for users and create social graph.

    Breadth-first crawl of the VK friend graph starting from
    ``settings.PARAMS["user_id"]`` for ``settings.PARAMS["levels"]`` levels,
    building a yEd graph and periodically writing a graphml backup.
    Returns ``(result, total_users, count_graph_users, count_bad_users)``.
    NOTE(review): the loop variable ``id`` shadows the builtin.
    """
    total_users = 0
    count_bad_users = 0
    count_graph_users = 0
    save_counter = settings.SAVE_COUNT  # countdown to the next backup write
    result = "successful"
    settings.print_message(
        "Create VK session for application (app_id={})".format(
            settings.VK_APPLICATION_ID))
    logger.debug("Create VK session for application (app_id={}).".format(
        settings.VK_APPLICATION_ID))
    try:
        session = vk.Session(access_token=settings.VK_ACCESS_TOKEN)
        api = vk.API(session)
    except Exception as error:
        logger.warn(traceback.format_exc())
        settings.print_message(
            "Can't create VK session for application with app_id={}".format(
                settings.VK_APPLICATION_ID))
        return ("with error.", 0, 0, 0)
    # logger.debug("".format())
    graph = yEdGraph.Graph()
    # Queue of ids for the current BFS level, seeded with the root user.
    level_queue = [
        int(settings.PARAMS["user_id"]),
    ]
    count_levels = int(settings.PARAMS["levels"])
    # NOTE(review): all_level_ids is populated nowhere in this body — dead?
    all_level_ids = [list() for _ in range(count_levels)]
    for step in range(count_levels):
        level_counter = len(level_queue)
        settings.print_message("Process level #{} (total users {})".format(
            step, level_counter))
        logger.debug("Process level #{} (total users {})".format(
            step, level_counter))
        for user_index in range(level_counter):
            id = level_queue[user_index]
            total_users += 1
            save_counter -= 1
            # Periodic graphml backup every SAVE_COUNT processed users.
            if save_counter <= 0:
                save_counter = settings.SAVE_COUNT
                logger.debug("Create graphml for graph.")
                graph.construct_graphml()
                logger.debug("Save graphml in backup file backup_{}.".format(
                    settings.OUTPUT_FILE))
                try:
                    with open("backup_{}".format(settings.OUTPUT_FILE),
                              "w",
                              encoding=settings.OUTPUT_ENCODING) as f:
                        f.write(graph.get_graph())
                except Exception as error:
                    logger.warn(traceback.format_exc())
                    logger.warn("Can not create backup file.")
            settings.print_message(
                "Process id {}. User #{} on level #{} (total {})".format(
                    id, user_index, step, level_counter), 2)
            logger.debug("Process id {}. User #{} on level #{} (total {}).".format(
                id, user_index, step, level_counter))
            try:
                settings.print_message("Add user node in graph.", 3)
                logger.debug("Check user (id={}) in graph".format(id))
                if not id in graph.nodes.keys():
                    logger.debug(
                        "Create user node in graph (id={}).".format(id))
                    logger.debug("Get info for user (id={}).".format(id))
                    try:
                        user_info = api.users.get(
                            user_ids=[id],
                            fields=
                            "nickname, sex, bdate, city, country, photo_200_orig, photo_200, photo_100"
                        )
                        if not user_info:
                            raise Exception("User info is empty.")
                    except Exception as error:
                        logger.warn(traceback.format_exc())
                        logger.debug(
                            "Can not get info for user (id={}), skip.".format(
                                id))
                        settings.print_message(
                            "Can not get info for user, skip.", 3)
                        count_bad_users += 1
                        continue
                    user_info = user_info[0]
                    logger.debug("User info='{}'.".format(
                        json.dumps(user_info)))
                    logger.debug("Load user photo (id={}).".format(id))
                    # Prefer the primary photo field, fall back to the second.
                    photo = utils.get_request(
                        user_info[settings.VK_PHOTO_1 if settings.VK_PHOTO_1
                                  in user_info else settings.VK_PHOTO_2])
                    if not photo:
                        logger.debug(
                            "Can't loading user photo (id={}).".format(id))
                    info_label = "ФИГ: {} {} \nНик: {} \nID: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                        user_info["first_name"]
                        if "first_name" in user_info else "----",
                        user_info["last_name"]
                        if "last_name" in user_info else "----",
                        user_info["nickname"]
                        if "nickname" in user_info else "----", id,
                        utils.SEX[user_info["sex"]]
                        if "sex" in user_info else "----",
                        user_info["bdate"]
                        if "bdate" in user_info else "----",
                        user_info["city"] if "city" in user_info else "----",
                        user_info["country"]
                        if "country" in user_info else "----")
                    graph.add_node(id,
                                   check_existance=False,
                                   label=info_label,
                                   shape="roundrectangle",
                                   font_style="italic",
                                   underlined_text="false",
                                   img=photo,
                                   width="200",
                                   height="200",
                                   border_has_color="false")
                    count_graph_users += 1
                else:
                    logger.debug(
                        "Graph contains user node (id={}).".format(id))
                    settings.print_message(
                        "Graph already contains this user node.", 3)
                settings.print_message("Get friendlist.", 3)
                logger.debug("Get friends for user (id={}).".format(id))
                try:
                    friends = api.friends.get(
                        user_id=id,
                        count=1000000,
                        fields=
                        "nickname, sex, bdate, city, country, photo_200_orig, photo_200, photo_100"
                    )
                    if not friends:
                        raise Exception("User info is empty.")
                except Exception as error:
                    logger.warn(traceback.format_exc())
                    logger.debug("Can not get friends, skip.")
                    settings.print_message("Can not get friendlist, skip.", 3)
                    count_bad_users += 1
                    continue
                settings.print_message(
                    "Process friends (total {}, level #{}).".format(
                        len(friends), step + 1), 3)
                logger.debug("Friends count: {}".format(len(friends)))
                # Enqueue unseen friends for the next BFS level, capped by
                # max_processing_friends.
                _ = [
                    level_queue.append(friend["user_id"])
                    for i, friend in enumerate(friends)
                    if not friend["user_id"] in graph.nodes
                    and i < settings.PARAMS["max_processing_friends"]
                ]
                logger.debug("Add node for each friend and create edges.")
                for friend_index, friend in enumerate(friends):
                    total_users += 1
                    logger.debug("Process friend #{} (id={}).".format(
                        friend_index, friend["user_id"]))
                    settings.print_message(
                        "Process friends #{} id={} (total {}, level #{}).".
                        format(friend_index, friend["user_id"], len(friends),
                               step + 1), 4)
                    if friend_index > settings.PARAMS["max_processing_friends"]:
                        break
                    settings.print_message("Add user node in graph.", 5)
                    logger.debug("Check user (id={}) in graph".format(
                        friend["user_id"]))
                    if not friend["user_id"] in graph.nodes.keys():
                        logger.debug(
                            "Create user node in graph (id={}).".format(
                                friend["user_id"]))
                        logger.debug("User info='{}'.".format(
                            json.dumps(friend)))
                        logger.debug("Load user photo (id={}).".format(
                            friend["user_id"]))
                        photo = utils.get_request(
                            friend[settings.VK_PHOTO_1 if settings.
                                   VK_PHOTO_1 in friend else settings.VK_PHOTO_2])
                        if not photo:
                            logger.debug(
                                "Can't loading user photo (id={}).".format(
                                    friend["user_id"]))
                        info_label = "ФИГ: {} {} \nНик: {} \nID: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                            friend["first_name"]
                            if "first_name" in friend else "----",
                            friend["last_name"]
                            if "last_name" in friend else "----",
                            friend["nickname"]
                            if "nickname" in friend else "----",
                            friend["user_id"],
                            utils.SEX[friend["sex"]]
                            if "sex" in friend else "----",
                            friend["bdate"] if "bdate" in friend else "----",
                            friend["city"] if "city" in friend else "----",
                            friend["country"]
                            if "country" in friend else "----")
                        graph.add_node(friend["user_id"],
                                       check_existance=False,
                                       label=info_label,
                                       shape="roundrectangle",
                                       font_style="italic",
                                       underlined_text="false",
                                       img=photo,
                                       width="200",
                                       height="200",
                                       border_has_color="false")
                        count_graph_users += 1
                    else:
                        logger.debug(
                            "Graph contains user node (id={}).".format(
                                friend["user_id"]))
                        settings.print_message(
                            "Graph already contains this user node.", 5)
                    logger.debug("Add edge {}-{} in graph.".format(
                        friend["user_id"], id))
                    # if ...
                    graph.add_edge(id,
                                   friend["user_id"],
                                   width="1.0",
                                   color="#000000",
                                   check_existance_nodes=False)
            except Exception as error:
                logger.warn(traceback.format_exc())
                result = "with error"
        # Drop the processed level; what remains is the next level's queue.
        level_queue = level_queue[level_counter:]
    # Final pass: connect the last (unexpanded) level back into the graph.
    logger.debug("Recovering the last level link.")
    settings.print_message("Recovering the last level link.")
    for user_index, user_id in enumerate(level_queue):
        settings.print_message(
            "Process id {}. User #{} on last level (total {})".format(
                user_id, user_index, len(level_queue)), 2)
        settings.print_message("Get friendlist.", 2)
        logger.debug("Get friends for user (id={}).".format(user_id))
        loop_counter = settings.MAX_RETRY
        friends = None
        # Retry loop: VK rate-limit errors (code 6) back off and retry;
        # other failures may prompt the operator.
        while (loop_counter > 0):
            try:
                loop_counter -= 1
                friends = api.friends.get(user_id=user_id, count=1000000)
                time.sleep(0.3)
                if not friends:
                    raise utils.EmptyDataException("User info is empty.")
                break
            except vk.exceptions.VkAPIError as error:
                logger.warn(traceback.format_exc())
                if error.code == 6:
                    time.sleep(0.4)
                    continue
                else:
                    break
            except utils.EmptyDataException as error:
                logger.warn(traceback.format_exc())
                break
            except Exception as error:
                logger.warn(traceback.format_exc())
                count_bad_users += 1
                loop_counter = settings.MAX_RETRY
                settings.print_message("Can not get friendlist, skip?", 2)
                if input("[y/n]: ") == 'n':
                    continue
                logger.debug("Can not get friends, skip.")
                break
        if not friends:
            continue
        settings.print_message(
            "Process friends (total {}).".format(len(friends)), 3)
        logger.debug("Friends count: {}".format(len(friends)))
        # Only link friends already present as nodes — no new nodes here.
        for friend_index, friend_id in enumerate(friends):
            if friend_id in graph.nodes.keys():
                logger.debug("Add edge {}-{} in graph.".format(
                    friend_id, user_id))
                graph.add_edge(user_id,
                               friend_id,
                               width="1.0",
                               color="#000000",
                               check_existance_nodes=False)
    logger.debug("Create graphml for graph.")
    graph.construct_graphml()
    logger.debug("Save graphml in file {}.".format(settings.OUTPUT_FILE))
    try:
        with open(settings.OUTPUT_FILE, "w",
                  encoding=settings.OUTPUT_ENCODING) as f:
            f.write(graph.get_graph())
        # A successful final save supersedes the periodic backup.
        if os.path.exists("backup_{}".format(settings.OUTPUT_FILE)):
            os.remove("backup_{}".format(settings.OUTPUT_FILE))
    except Exception as error:
        logger.warn(traceback.format_exc())
        result = "with error"
    return (result, total_users, count_graph_users, count_bad_users)
def get_friends_of_users(uids):
    """ Get lists of friends by users with uid of uids and create social graph.

    Single-level variant of the graph crawl: for each uid in *uids*, adds
    the user node, its friends (capped by max_processing_friends) and the
    connecting edges, periodically writing a graphml backup. Returns
    ``(result, total_users, count_graph_users, count_bad_users)``.
    NOTE(review): the loop variable ``id`` shadows the builtin.
    """
    total_users = 0
    count_bad_users = 0
    count_graph_users = 0
    save_counter = settings.SAVE_COUNT  # countdown to the next backup write
    result = "successful"
    settings.print_message(
        "Create VK session for application (app_id={})".format(
            settings.VK_APPLICATION_ID))
    logger.debug("Create VK session for application (app_id={}).".format(
        settings.VK_APPLICATION_ID))
    try:
        session = vk.Session(access_token=settings.VK_ACCESS_TOKEN)
        api = vk.API(session)
    except Exception as error:
        logger.warn(traceback.format_exc())
        settings.print_message(
            "Can't create VK session for application with app_id={}".format(
                settings.VK_APPLICATION_ID))
        return ("with error.", 0, 0, 0)
    graph = yEdGraph.Graph()
    for user_index, id in enumerate(uids):
        total_users += 1
        save_counter -= 1
        # Periodic graphml backup every SAVE_COUNT processed users.
        if save_counter <= 0:
            save_counter = settings.SAVE_COUNT
            logger.debug("Create graphml for graph.")
            graph.construct_graphml()
            logger.debug("Save graphml in backup file backup_{}.".format(
                settings.OUTPUT_FILE))
            try:
                with open("backup_{}".format(settings.OUTPUT_FILE),
                          "w",
                          encoding=settings.OUTPUT_ENCODING) as f:
                    f.write(graph.get_graph())
            except Exception as error:
                logger.warn(traceback.format_exc())
                logger.warn("Can not create backup file.")
        settings.print_message(
            "Process id {}. User #{} (total {})".format(
                id, user_index, len(uids)), 2)
        logger.debug("Process id {}. User #{} (total {})".format(
            id, user_index, len(uids)))
        try:
            settings.print_message("Add user node in graph.", 3)
            logger.debug("Check user (id={}) in graph".format(id))
            if not id in graph.nodes.keys():
                logger.debug("Create user node in graph (id={}).".format(id))
                logger.debug("Get info for user (id={}).".format(id))
                try:
                    user_info = api.users.get(
                        user_ids=[id],
                        fields=
                        "nickname, sex, bdate, city, country, photo_200_orig, photo_200, photo_100"
                    )
                    if not user_info:
                        raise Exception("User info is empty.")
                except Exception as error:
                    logger.warn(traceback.format_exc())
                    logger.debug(
                        "Can not get info for user (id={}), skip.".format(id))
                    settings.print_message("Can not get info for user, skip.",
                                           3)
                    count_bad_users += 1
                    continue
                user_info = user_info[0]
                # Canonicalise the id to the uid VK reports back.
                id = user_info["uid"]
                logger.debug("User info='{}'.".format(json.dumps(user_info)))
                logger.debug("Load user photo (id={}).".format(id))
                # Prefer the primary photo field, fall back to the second.
                photo = utils.get_request(
                    user_info[settings.VK_PHOTO_1 if settings.VK_PHOTO_1 in
                              user_info else settings.VK_PHOTO_2])
                if not photo:
                    logger.debug(
                        "Can't loading user photo (id={}).".format(id))
                info_label = "ФИГ: {} {} \nНик: {} \nID: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                    user_info["first_name"]
                    if "first_name" in user_info else "----",
                    user_info["last_name"]
                    if "last_name" in user_info else "----",
                    user_info["nickname"]
                    if "nickname" in user_info else "----", id,
                    utils.SEX[user_info["sex"]]
                    if "sex" in user_info else "----",
                    user_info["bdate"] if "bdate" in user_info else "----",
                    user_info["city"] if "city" in user_info else "----",
                    user_info["country"]
                    if "country" in user_info else "----")
                graph.add_node(id,
                               check_existance=False,
                               label=info_label,
                               shape="roundrectangle",
                               font_style="italic",
                               underlined_text="false",
                               img=photo,
                               width="200",
                               height="200",
                               border_has_color="false")
                count_graph_users += 1
            else:
                logger.debug("Graph contains user node (id={}).".format(id))
                settings.print_message(
                    "Graph already contains this user node.", 3)
            settings.print_message("Get friendlist.", 3)
            logger.debug("Get friends for user (id={}).".format(id))
            try:
                friends = api.friends.get(
                    user_id=id,
                    count=1000000,
                    fields=
                    "nickname, sex, bdate, city, country, photo_200_orig, photo_200, photo_100"
                )
                if not friends:
                    raise Exception("User info is empty.")
            except Exception as error:
                logger.warn(traceback.format_exc())
                logger.debug("Can not get friends, skip.")
                settings.print_message("Can not get friendlist, skip.", 3)
                count_bad_users += 1
                continue
            settings.print_message(
                "Process friends (total {}).".format(len(friends)), 3)
            logger.debug("Friends count: {}".format(len(friends)))
            logger.debug("Add node for each friend and create edges.")
            for friend_index, friend in enumerate(friends):
                total_users += 1
                logger.debug("Process friend #{} (id={}).".format(
                    friend_index, friend["user_id"]))
                settings.print_message(
                    "Process friends #{} id={} (total {}).".format(
                        friend_index, friend["user_id"], len(friends)), 4)
                # Cap the number of friends expanded per user.
                if friend_index > settings.PARAMS["max_processing_friends"]:
                    break
                settings.print_message("Add user node in graph.", 5)
                logger.debug("Check user (id={}) in graph".format(
                    friend["user_id"]))
                if not friend["user_id"] in graph.nodes.keys():
                    logger.debug("Create user node in graph (id={}).".format(
                        friend["user_id"]))
                    logger.debug("User info='{}'.".format(json.dumps(friend)))
                    logger.debug("Load user photo (id={}).".format(
                        friend["user_id"]))
                    photo = utils.get_request(
                        friend[settings.VK_PHOTO_1 if settings.VK_PHOTO_1 in
                               friend else settings.VK_PHOTO_2])
                    if not photo:
                        logger.debug(
                            "Can't loading user photo (id={}).".format(
                                friend["user_id"]))
                    info_label = "ФИГ: {} {} \nНик: {} \nID: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                        friend["first_name"]
                        if "first_name" in friend else "----",
                        friend["last_name"]
                        if "last_name" in friend else "----",
                        friend["nickname"]
                        if "nickname" in friend else "----",
                        friend["user_id"],
                        utils.SEX[friend["sex"]]
                        if "sex" in friend else "----",
                        friend["bdate"] if "bdate" in friend else "----",
                        friend["city"] if "city" in friend else "----",
                        friend["country"] if "country" in friend else "----")
                    graph.add_node(friend["user_id"],
                                   check_existance=False,
                                   label=info_label,
                                   shape="roundrectangle",
                                   font_style="italic",
                                   underlined_text="false",
                                   img=photo,
                                   width="200",
                                   height="200",
                                   border_has_color="false")
                    count_graph_users += 1
                else:
                    logger.debug("Graph contains user node (id={}).".format(
                        friend["user_id"]))
                    settings.print_message(
                        "Graph already contains this user node.", 5)
                logger.debug("Add edge {}-{} in graph.".format(
                    friend["user_id"], id))
                # if ...
                graph.add_edge(id,
                               friend["user_id"],
                               width="1.0",
                               color="#000000",
                               check_existance_nodes=False)
        except Exception as error:
            logger.warn(traceback.format_exc())
            result = "with error"
    logger.debug("Create graphml for graph.")
    graph.construct_graphml()
    logger.debug("Save graphml in file {}.".format(settings.OUTPUT_FILE))
    try:
        with open(settings.OUTPUT_FILE, "w",
                  encoding=settings.OUTPUT_ENCODING) as f:
            f.write(graph.get_graph())
        # A successful final save supersedes the periodic backup.
        if os.path.exists("backup_{}".format(settings.OUTPUT_FILE)):
            os.remove("backup_{}".format(settings.OUTPUT_FILE))
    except Exception as error:
        logger.warn(traceback.format_exc())
        result = "with error"
    return (result, total_users, count_graph_users, count_bad_users)
def save_code(self, pr_id, pr_name):
    """Synchronise the branches of one project between GitLab and Redis.

    Fetches the project's branches from GitLab and compares them with the
    branch hashes stored in Redis (keys ``p_<id>:<b16-branch-name>``):
    branches present only in Redis are deleted (together with their commit
    links and any commits no longer reachable from any branch), while new
    or still-existing branches are (re)written and their commits updated
    via ``util_commit.update``.

    :param pr_id:   numeric project identifier (used in the ``p_<id>`` key)
    :param pr_name: project name, forwarded to ``util_commit.update``

    NOTE(review): this code is Python 2 style — ``base64.b16decode`` is fed
    a ``str`` slice and ``eval`` is used to parse the stored contributor
    list; both assume trusted, self-written Redis content.
    """
    # Generate pseudo-key-id
    __p_id = "p_" + str(pr_id)

    # Generate metadata from gitlab: branch name -> branch info dict
    __branches_gl_info = {}
    __branches = self.gl_instance.get_projects_repository_branches_byId(
        id=pr_id)
    [__branches_gl_info.update({x.get("name"): x}) for x in __branches]

    # Generate metadata from redis: decoded branch name -> stored hash.
    # Key layout is "p_<id>:<base16(branch name)>", hence the split/decode.
    __branches_rd_info = {}
    __branches = self.rd_instance_br.keys(__p_id + ":*")
    [
        __branches_rd_info.update({
            base64.b16decode(x.split(":")[1]):
            self.rd_instance_br.hgetall(x)
        }) for x in __branches
    ]

    # Generate difference and intersection metadata:
    # __mt_mod = new + still-existing branches, __mt_del = redis-only ones.
    __mt_diff = set(__branches_gl_info.keys()).difference(
        set(__branches_rd_info.keys()))
    __mt_int = set(__branches_gl_info.keys()).intersection(
        set(__branches_rd_info.keys()))
    __mt_mod = list(__mt_diff.union(__mt_int))
    __mt_del = list(
        set(__branches_rd_info.keys()).difference(
            set(__branches_gl_info.keys())))

    # Structure for removed commits ("p_<id>:<sha>" candidates for deletion)
    __mt_del_commits = set()

    # Delete information about Branch
    count = 0
    for i in __mt_del:
        # Number of deleted branches
        count += 1

        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) [%d/%d] Deleted %s" %
                                 (int(pr_id), count, len(__mt_del), i))

        # Get information from redis
        __br_info = __branches_rd_info[i]

        # Generate pseudo-key-id and remove info
        __br_id = __p_id + ":" + __br_info.get("id")
        self.rd_instance_br.delete(__br_id)
        self.rd_instance_br_co.delete(__br_id)

        # Remove links with contributors.
        # NOTE(review): `eval` parses the stored list-literal of contributor
        # keys — assumes the value was written by this tool.
        __br_con = eval(__br_info.get("contributors"))
        for j in __br_con:
            __us_com = self.rd_instance_us_co.smembers(j)
            for x in __us_com:
                if str(x).startswith(__br_id):
                    # Member format is "p_<id>:<b16-branch>:<sha>"; keep
                    # project + sha so the commit key can be rebuilt below.
                    __mt_del_commits.add(
                        str(x).split(":")[0] + ":" + str(x).split(":")[2])
                    self.rd_instance_us_co.srem(j, x)

    # Remove all unique commits: a candidate commit is only deleted when it
    # is not referenced from any remaining branch of the project.
    if len(__mt_del_commits) > 0:
        __rd_branch_co = set()
        __rd_branch = self.rd_instance_br.keys(__p_id + "*")
        for i in __rd_branch:
            __rd_branch_co = __rd_branch_co.union(
                set(dict(self.rd_instance_br_co.zrange(i, 0, -1)).keys()))
        for i in __mt_del_commits:
            if i not in __rd_branch_co:
                self.rd_instance_co.delete(i)

    # Update information about Branch
    count = 0
    for i in __mt_mod:
        # Number of reviewed branches
        count += 1

        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) [%d/%d] Reviewed %s" %
                                 (int(pr_id), count, len(__mt_mod), i))

        # Clean information
        __br_info = __branches_gl_info[i]
        st_clean.branch(__br_info)

        # Generate pseudo-key-id
        __br_id = __p_id + ":" + __br_info.get("id")

        # Save / Replace information at redis
        self.rd_instance_br.hmset(__br_id, __br_info)

        # Update information about branch's commits
        util_commit.update(self, pr_id, pr_name, i)
def dispatch(command): result = None logger.debug("command %s.", command) start_time = datetime.now() try: for case in utils.Switch(command): if case("getFriendsGraph"): logger.debug("Processing command '%s'." % command) settings.print_message("Processing command '%s'." % command) # START COMMAND result = get_friends_graph() logger.debug( "Processing %s. Total users: %i. Users in graph: %i Bad requests: %i." % result) settings.print_message( "Processing %s. Total users: %i. Users in graph: %i Bad requests: %i." % result) break if case(): # default logger.warn("Unknown command: %s" % command) settings.print_message("Unknown command: %s" % command) break except KeyboardInterrupt: settings.print_message( "Caught KeyboardInterrupt, terminating processing") except: logger.error(traceback.format_exc()) settings.print_message("Processing finished with error.") settings.print_message("For more details, see the log.") end_time = datetime.now() settings.print_message("Run began on {0}".format(start_time)) settings.print_message("Run ended on {0}".format(end_time)) settings.print_message("Elapsed time was: {0}".format(end_time - start_time)) logger.debug("Run began on {0}".format(start_time)) logger.debug("Run ended on {0}".format(end_time)) logger.debug("Elapsed time was: {0}".format(end_time - start_time))
def _cluster_handler(cluster_id, papers_count):
    """Walk all result pages of a Google Scholar cluster and merge its papers.

    For every paper block the EndNote link and cited-by count are extracted;
    EndNote files are downloaded and de-duplicated (papers considered equal
    by title/year/author-count/type/pages are merged, keeping the record
    with more fields).

    :param cluster_id:   Scholar cluster identifier, substituted into the URL
    :param papers_count: expected number of papers (used only for logging)
    :returns: tuple of merged EndNote dicts, or ``None`` when a cluster page
              could not be fetched
    :raises Exception: when a paper block carries no EndNote link (Scholar
              must be switched to English display settings)
    """
    logger.debug("Handle %i papers from cluster %s." %
                 (papers_count, cluster_id))
    url = _FULLURL.format(_HOST, _SCHOLARCLUSTER.format(cluster_id))
    logger.debug("Get cluster page URL='{0}'.".format(url))
    soup = utils.get_soup(url)
    # This dictionary contains info about unique papers
    EndNote_list = list()
    file_counter = 0
    merged_counter = 0

    # return true if EndNote_1 equal EndNote_2.
    # Missing "year"/"pages" on either side is treated as a wildcard match.
    def is_EndNote_equal(EndNote_1, EndNote_2):
        return \
            EndNote_1["title"].lower() == EndNote_2["title"].lower() and \
            (
                not "year" in EndNote_1 or not "year" in EndNote_2
                or EndNote_1["year"] == EndNote_2["year"]
            ) \
            and len(EndNote_1["author"]) == len(EndNote_2["author"]) \
            and EndNote_1["type"] == EndNote_2["type"] and \
            (
                not "pages" in EndNote_1 or not "pages" in EndNote_2
                or EndNote_1["pages"] == EndNote_2["pages"]
            )
        # FIX: the author-count comparison used to compare EndNote_1 to
        # itself (always True), so papers with different author lists merged.

    # return list of similar papers (maybe empty)
    def intersect_papers(EndNote_data, EndNote_list):
        return \
            [i for i in EndNote_list if is_EndNote_equal(EndNote_data, i)]

    # Loop on pages
    while True:
        if soup is None:
            logger.debug(
                "Soup for cluster page URL='{0}' is None.".format(url))
            return None
        # This list contains links to EndNote and cited by count for each
        # paper in cluster
        logger.debug("Find EndNote links for each paper in cluster.")
        footer_links = [{
            "EndNote" if "EndNote" in link.text else "citedby":
            link["href"].strip() if "EndNote" in link.text else int(
                re.findall(r'\d+', link.text)[0])
            for link in paper_block.find("div", class_="gs_fl").find_all('a')
            if "EndNote" in link.text or "Cited" in link.text
            or "Цитируется" in link.text
        } for paper_block in soup.find_all('div', class_='gs_ri')]
        logger.debug(
            "Extract unique papers in cluster and load data from EndNote.")
        for links in footer_links:
            if links != {}:
                file_counter += 1
                logger.debug("EndNote file #%i (total %i)" %
                             (file_counter, papers_count))
                if links.get("EndNote"):
                    paper_EndNote_data = get_info_from_EndNote(
                        links["EndNote"], True)
                else:
                    # No EndNote link means Scholar is not in English mode.
                    settings.print_message(
                        'Error getting EndNote files. '
                        'Please change the display settings Google Scholar in English '
                        '(https://scholar.google.com/).')
                    logger.debug(
                        'End work programme because did not find link to EndNote file.'
                    )
                    raise Exception('Did not find EndNote.')
                if paper_EndNote_data is None:
                    logger.debug(
                        "Skip EndNote file #%i, could not upload file." %
                        file_counter)
                    continue
                if not "year" in paper_EndNote_data or not "author" in paper_EndNote_data:
                    logger.debug(
                        "Skip EndNote file #%i, empty year or authors fields."
                        % file_counter)
                else:
                    similar_papers = intersect_papers(paper_EndNote_data,
                                                      EndNote_list)
                    if similar_papers == []:
                        # A genuinely new paper: remember it together with
                        # its EndNote URL and cited-by count.
                        merged_counter += 1
                        logger.debug(
                            "EndNote file #%i miss all EndNote files in merged array."
                            % file_counter)
                        logger.debug("Add EndNote file #%i in merged array." %
                                     file_counter)
                        paper_EndNote_data.update({
                            "url_scholarbib": links["EndNote"],
                            "citedby": links["citedby"]
                            if "citedby" in links else None
                        })
                        EndNote_list.append(paper_EndNote_data)
                    else:
                        # Duplicate: keep whichever record has more fields.
                        similar_file = similar_papers[0]
                        similar_file_index = EndNote_list.index(similar_file)
                        if len(similar_file) < len(paper_EndNote_data):
                            logger.debug(
                                "EndNote file #{0} like #{1} EndNote file in merged array and has more fields, replace."
                                .format(file_counter, similar_file_index + 1))
                            EndNote_list[
                                similar_file_index] = paper_EndNote_data
                        else:
                            logger.debug(
                                "EndNote file #{0} like #{1} EndNote file in merged array, skipped."
                                .format(file_counter, similar_file_index + 1))
        # NEXT button on html page
        if soup.find(class_='gs_ico gs_ico_nav_next'):
            url = soup.find(
                class_='gs_ico gs_ico_nav_next').parent['href'].strip()
            logger.debug("Load next page in resulting query selection.")
            soup = utils.get_soup(_FULLURL.format(_HOST, url))
        else:
            break
    if merged_counter == 0:
        logger.debug(
            "All %i EndNote files in the cluster are not informative. No merged files."
            % file_counter)
    else:
        logger.debug(
            "All {0} EndNote files merged in {1} (i.e. distinct versions in cluster: {1}):"
            .format(file_counter, merged_counter))
        for counter, data in enumerate(EndNote_list):
            logger.debug("Merged EndNote file #%i:\n%s" %
                         (counter + 1, data["EndNote"]))
    return tuple(EndNote_list)
def get_friends_graph():
    """Get friends for users and create a social graph.

    Starting from ``settings.PARAMS["user_id"]``, performs a breadth-first
    walk over ``settings.PARAMS["levels"]`` friendship levels via the VK API,
    adds one node per user (with photo and profile label) and one edge per
    friendship to a yEd graph, then writes the graphml to
    ``settings.OUTPUT_FILE``.

    :returns: tuple ``(result_message, total_users, count_graph_users,
              count_bad_users)``
    """
    total_users = 0
    count_bad_users = 0
    count_graph_users = 0
    result = "was successful"
    settings.print_message(
        "Create VK session for application (app_id={})".format(
            settings.VK_APPLICATION_ID))
    logger.debug("Create VK session for application (app_id={}).".format(
        settings.VK_APPLICATION_ID))
    try:
        session = vk.Session(access_token=settings.VK_ACCESS_TOKEN)
        api = vk.API(session)
    except Exception:
        logger.warn(traceback.format_exc())
        settings.print_message(
            "Can't create VK session for application with app_id={}".format(
                settings.VK_APPLICATION_ID))
        return ("with error.", 0, 0, 0)
    graph = yEdGraph.Graph()
    # BFS frontier: the ids to process on the current level.
    level_queue = [
        int(settings.PARAMS["user_id"]),
    ]
    count_levels = int(settings.PARAMS["levels"])
    for step in range(count_levels):
        # FIX: level_counter must be computed BEFORE it is reported — the
        # original referenced it in the two messages below prior to
        # assignment, raising NameError on the first iteration.
        level_counter = len(level_queue)
        settings.print_message("Process level #{} (total users {})".format(
            step, level_counter))
        logger.debug("Process level #{} (total users {})".format(
            step, level_counter))
        for user_index in range(level_counter):
            # Renamed from `id` to avoid shadowing the builtin.
            user_id = level_queue[user_index]
            total_users += 1
            settings.print_message(
                "Process id {}. User #{} on level #{} (total {})".format(
                    user_id, user_index, step, level_counter), 2)
            logger.debug(
                "Process id {}. User #{} on level #{} (total {}).".format(
                    user_id, user_index, step, level_counter))
            try:
                settings.print_message("Add user node in graph.", 3)
                logger.debug("Check user (id={}) in graph".format(user_id))
                if not user_id in graph.nodes.keys():
                    logger.debug(
                        "Create user node in graph (id={}).".format(user_id))
                    logger.debug("Get info for user (id={}).".format(user_id))
                    try:
                        user_info = api.users.get(user_ids=[user_id],
                                                  fields="photo_200_orig")
                    except Exception:
                        logger.warn(traceback.format_exc())
                        logger.debug(
                            "Can not get info for user (id={}), skip.".format(
                                user_id))
                        settings.print_message(
                            "Can not get info for user, skip.", 3)
                        count_bad_users += 1
                        continue
                    if not user_info:
                        logger.debug(
                            "Can not get info for user (id={}), skip.".format(
                                user_id))
                        settings.print_message(
                            "Can not get info for user, skip.", 3)
                        count_bad_users += 1
                        continue
                    user_info = user_info[0]
                    logger.debug("User info='{}'.".format(
                        json.dumps(user_info)))
                    logger.debug("Load user photo (id={}).".format(user_id))
                    photo = utils.get_request(user_info["photo_200_orig"])
                    if not photo:
                        logger.debug(
                            "Can't loading user photo (id={}).".format(
                                user_id))
                    # Node label: profile fields, "----" for anything absent.
                    info_label = "ID: {} \nФИГ: {} {} \nНик: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                        user_id, user_info["first_name"]
                        if "first_name" in user_info else "----",
                        user_info["last_name"]
                        if "last_name" in user_info else "----",
                        user_info["nickname"]
                        if "nickname" in user_info else "----",
                        utils.SEX[user_info["sex"]]
                        if "sex" in user_info else "----", user_info["bdate"]
                        if "bdate" in user_info else "----", user_info["city"]
                        if "city" in user_info else "----",
                        user_info["country"]
                        if "country" in user_info else "----")
                    graph.add_node(user_id,
                                   check_existance=False,
                                   label=info_label,
                                   shape="roundrectangle",
                                   font_style="italic",
                                   underlined_text="false",
                                   img=photo,
                                   width="200",
                                   height="200",
                                   border_has_color="false")
                    count_graph_users += 1
                else:
                    logger.debug(
                        "Graph contains user node (id={}).".format(user_id))
                    settings.print_message(
                        "Graph already contains this user node.", 3)
                settings.print_message("Get friendlist.", 3)
                logger.debug("Get friend for user (id={}).".format(user_id))
                try:
                    friends = api.friends.get(
                        user_id=user_id,
                        count=1000000,
                        fields=
                        "nickname, sex, bdate, city, country, photo_200_orig")
                except Exception:
                    logger.debug("Can not get friends, skip.")
                    settings.print_message("Can not get friendlist, skip.", 3)
                    count_bad_users += 1
                    continue
                if not friends:
                    logger.debug("Can not get friends, skip.")
                    settings.print_message("Can not get friendlist, skip.", 3)
                    count_bad_users += 1
                    continue
                settings.print_message(
                    "Process friends (total {}, level #{}).".format(
                        len(friends), step + 1), 3)
                logger.debug("Friends count: {}".format(len(friends)))
                # Enqueue unseen friends for the next BFS level (was a
                # side-effect list comprehension; a plain loop is idiomatic).
                for friend in friends:
                    if not friend["user_id"] in graph.nodes:
                        level_queue.append(friend["user_id"])
                logger.debug("Add node for each friend and create edges.")
                for friend_index, friend in enumerate(friends):
                    total_users += 1
                    logger.debug("Process friend #{} (id={}).".format(
                        friend_index, friend["user_id"]))
                    settings.print_message(
                        "Process friends #{} id={} (total {}, level #{}).".
                        format(friend_index, friend["user_id"], len(friends),
                               step + 1), 4)
                    settings.print_message("Add user node in graph.", 5)
                    logger.debug("Check user (id={}) in graph".format(
                        friend["user_id"]))
                    if not friend["user_id"] in graph.nodes.keys():
                        logger.debug(
                            "Create user node in graph (id={}).".format(
                                friend["user_id"]))
                        logger.debug("User info='{}'.".format(
                            json.dumps(friend)))
                        logger.debug("Load user photo (id={}).".format(
                            friend["user_id"]))
                        photo = utils.get_request(friend["photo_200_orig"])
                        if not photo:
                            logger.debug(
                                "Can't loading user photo (id={}).".format(
                                    friend["user_id"]))
                        info_label = "ID: {} \nФИГ: {} {} \nНик: {} \nПол: {} \nДата рождения: {} \nГород: {} \nСтрана: {}".format(
                            friend["user_id"], friend["first_name"]
                            if "first_name" in friend else "----",
                            friend["last_name"]
                            if "last_name" in friend else "----",
                            friend["nickname"]
                            if "nickname" in friend else "----",
                            utils.SEX[friend["sex"]]
                            if "sex" in friend else "----", friend["bdate"]
                            if "bdate" in friend else "----", friend["city"]
                            if "city" in friend else "----", friend["country"]
                            if "country" in friend else "----")
                        graph.add_node(friend["user_id"],
                                       check_existance=False,
                                       label=info_label,
                                       shape="roundrectangle",
                                       font_style="italic",
                                       underlined_text="false",
                                       img=photo,
                                       width="200",
                                       height="200",
                                       border_has_color="false")
                        count_graph_users += 1
                    else:
                        logger.debug(
                            "Graph contains user node (id={}).".format(
                                friend["user_id"]))
                        settings.print_message(
                            "Graph already contains this user node.", 5)
                    # FIX: log message typo ("adge" -> "edge").
                    logger.debug("Add edge {}-{} in graph.".format(
                        friend["user_id"], user_id))
                    graph.add_edge(friend["user_id"],
                                   user_id,
                                   width="1.0",
                                   color="#000000",
                                   check_existance_nodes=False)
            except Exception:
                logger.warn(traceback.format_exc())
                result = "with error"
        # Drop the processed level; what remains is the next frontier.
        level_queue = level_queue[level_counter:]
    logger.debug("Create graphml for graph.")
    graph.construct_graphml()
    logger.debug("Save graphml in file {}.".format(settings.OUTPUT_FILE))
    try:
        with open(settings.OUTPUT_FILE,
                  "w",
                  encoding=settings.OUTPUT_ENCODING) as f:
            f.write(graph.get_graph())
    except Exception:
        logger.warn(traceback.format_exc())
        result = "with error"
    return (result, total_users, count_graph_users, count_bad_users)
def update(self, pr_id, pr_name, br_name):
    """Synchronise the commits of one branch between GitLab and Redis.

    Compares the branch's commit set in GitLab with the sorted set stored
    in Redis (key ``p_<id>:<b16-branch-name>``), inserts new commits,
    unlinks deleted ones from their contributors, rebuilds the branch's
    commit zset when anything changed, and refreshes the branch's
    ``contributors`` field.

    :param pr_id:   numeric project identifier
    :param pr_name: project name, forwarded to ``get_commit_info``
    :param br_name: branch name (base16-encoded into the Redis key)

    NOTE(review): Python 2 code — ``long`` and ``base64.b16encode`` on str;
    the old redis-py ``zadd(key, member, score)`` calling convention is used.
    """
    # Generate pseudo-key-id
    __pr_id = "p_" + str(pr_id)
    __br_id = __pr_id + ":" + base64.b16encode(br_name)

    # Data structure for branch's collaborators (b16-encoded author emails)
    __br_info_collaborators = set()

    # Create Redis Data structure (id + score, in this case timestamp);
    # filled as a flat [id, score, id, score, ...] list.
    __co_br = []

    # Get all commits from specific branch (gitlab) ids + commit's info
    __co_gl_val = self.gl_instance.get_projects_repository_commits_byId(id=pr_id, ref_name=br_name)
    __co_gl_id = map(lambda x: __pr_id + ":" + x.get("id"), __co_gl_val)
    __co_gl_val = dict(zip(__co_gl_id, __co_gl_val))

    # Get all commits from specific branch (redis) ids + created_at
    __co_rd_id = []
    __co_rd_val = {}
    __prev_info = len(self.rd_instance_br_co.keys(__br_id)) > 0
    if __prev_info:
        # NOTE(review): eval parses the stored list-literal of contributors —
        # assumes the value was written by this tool.
        __br_info_collaborators = set(eval(self.rd_instance_br.hgetall(__br_id).get("contributors")))
        __co_rd_val = dict(self.rd_instance_br_co.zrange(__br_id, 0, -1, withscores=True))
        __co_rd_id = __co_rd_val.keys()

    # Generate difference and intersection metadata
    __mt_new = list(set(__co_gl_id).difference(set(__co_rd_id)))
    __mt_del = list(set(__co_rd_id).difference(set(__co_gl_id)))
    __mt_mod = list(set(__co_gl_id).intersection(set(__co_rd_id)))

    # Fill branch's commits without deleted
    if __prev_info:
        [__co_br.extend([i, long(__co_rd_val[i])]) for i in __mt_mod]

    # Regenerate structure of branch: drop the zset so it can be re-injected
    if len(__mt_new) > 0 or len(__mt_del) > 0:
        self.rd_instance_br_co.delete(__br_id)

    # Update or add commits to redis
    for i in __mt_new:
        # Get commit identifier (sha) + info
        __co_id = i
        __co_id_org = str(__co_id).replace(__pr_id + ":", "")

        # Get email from commit and add as contributor
        __co_em = __co_gl_val[__co_id].get("author_email").lower()
        __user_key = base64.b16encode(__co_em)
        __br_info_collaborators.add(__user_key)

        # Get information from gitlab or redis
        if len(self.rd_instance_co.keys(__co_id)) == 0:
            __co_info = __co_gl_val[__co_id]
            st_clean.commit(__co_info)

            # Get commit information from git log
            get_commit_info(pr_id, pr_name, __co_info)
            __co_info["author"] = __user_key

            # Insert commit information
            self.rd_instance_co.hmset(__co_id, __co_info)
        else:
            __co_info = self.rd_instance_co.hgetall(__co_id)

        # Set values at Redis Structure - User
        self.rd_instance_us_co.zadd(__user_key, __br_id + ":" + __co_id_org, long(__co_info.get("created_at")))

        # Set values at Redis Structure - Branch (id + timestamp)
        __co_br.append(__co_id)
        __co_br.append(long(__co_info.get("created_at")))

    for i in __mt_del:
        # Get commit identifier (sha) + info
        __co_id = i
        __co_id_org = str(__co_id).replace(__pr_id + ":", "")
        __co_info = self.rd_instance_co.hgetall(__co_id)

        # Get email from commit and add as contributor
        __co_em = __co_info.get("author_email").lower()
        __user_key = base64.b16encode(__co_em)
        self.rd_instance_us_co.zrem(__user_key, __br_id + ":" + __co_id_org)

    # Check if contributors keep being same: drop any contributor who no
    # longer has a commit on this branch.
    if len(__mt_del) > 0:
        __br_info_collaborators_tmp = __br_info_collaborators.copy()
        for i in __br_info_collaborators:
            count_co = 0
            __br_us_co = self.rd_instance_us_co.zrange(i, 0, -1)
            for j in __br_us_co:
                if str(j).startswith(__br_id):
                    count_co = 1
                    break
            if count_co == 0:
                __br_info_collaborators_tmp.remove(i)
        __br_info_collaborators = __br_info_collaborators_tmp

    # Inject commits to branch from data structure filled
    if len(__mt_new) > 0 or len(__mt_del) > 0:
        inject.inject_branch_commits(self.rd_instance_br_co, pr_id, br_name, __co_br)

    # Insert information to branch
    self.rd_instance_br.hset(__br_id, "contributors", list(__br_info_collaborators))

    if len(__mt_new) > 0:
        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) Added %d Commits" % (int(pr_id), len(__mt_new)))
    if len(__mt_del) > 0:
        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) Deleted %d Commits" % (int(pr_id), len(__mt_del)))
def _get_info_from_resulting_selection(paper_soup, handling_cluster=False):
    """Retrieve data about an article in the resulting selection.

    Parses one Google Scholar result block: title/url, authors (with GIDs
    where the author name links to a citations profile), year, cluster id,
    PDF link, and either the merged cluster versions (when
    ``handling_cluster`` and a cluster link exists) or the paper's own
    EndNote data and cited-by count.

    :param paper_soup:       BeautifulSoup node of one result ('gs_ri' block)
    :param handling_cluster: when True, expand the "versions" cluster
    :returns: dict with ``general_information``, ``link_to_pdf`` and
              ``different_information``
    """
    # Full info about paper include general and addition information
    # MAYBE no one addition information, because this paper in cluster
    # and for each paper from cluster contains additional info
    settings.print_message("Google scholar:", 2)
    settings.print_message("Get general information.", 3)
    full_info = dict()
    general_information = dict()
    databox = paper_soup.find('div', class_='gs_ri')
    title = databox.find('h3', class_='gs_rt')
    if title.find('span', class_='gs_ct'):  # A citation
        title.span.extract()
    elif title.find('span', class_='gs_ctc'):  # A book or PDF
        title.span.extract()
    general_information['title'] = title.text.strip()
    if title.find('a'):
        general_information['url'] = title.find('a')['href'].strip()
    paperinfo = databox.find('div', class_='gs_a')
    # Authors: shortnames from the byline text; a GID is attached when the
    # next <a> in the byline matches the shortname (a citations profile).
    author_list = list()
    author_ref_list = paperinfo('a')
    ref_index = 0
    ref_list_len = len(author_ref_list)
    for auth_shortname in paperinfo.text.split("-")[0].split(","):
        GID = ""
        auth_shortname = auth_shortname.strip(" …\xa0")
        if ref_list_len > ref_index and auth_shortname == author_ref_list[
                ref_index].text:
            GID = re.findall(_CITATIONAUTHRE,
                             author_ref_list[ref_index]['href'].strip())[0]
            ref_index += 1
        author_list.append({"shortname": auth_shortname, "gid": GID})
    general_information['author'] = author_list
    year = re.findall("[0-9]{4}", paperinfo.text)
    if len(year) != 0:
        general_information['year'] = int(year[0])
    # Save general info
    full_info["general_information"] = general_information
    settings.print_message("Title: '%s'" % general_information['title'], 3)
    # Get addition information (maybe paper in cluster then analysis cluster
    # and get additional info for each unique paper in cluster)
    footer_links = databox.find('div', class_='gs_fl').find_all('a')
    settings.print_message("Get additional information.", 3)
    count_sim_papers = 0
    for link in footer_links:
        if 'versions' in link.text or 'версии статьи' in link.text:
            count_sim_papers = int(re.findall(r'\d+', link.text.strip())[0])
            logger.debug("In cluster %i papers." % count_sim_papers)
            general_information["cluster"] = int(
                re.findall(r'\d+', link['href'].strip())[0])
            break
    # check: have paper link to pdf
    # and take this link if exists
    link_to_pdf = _get_url_pdf(paper_soup)
    full_info['link_to_pdf'] = link_to_pdf
    # CLUSTER HANDLER
    # FIX: "cluster" is only set when a versions link was found above;
    # direct indexing raised KeyError for papers without one. Use .get().
    if handling_cluster and general_information.get("cluster") is not None:
        settings.print_message(
            "In cluster %i similar papers." % count_sim_papers, 3)
        settings.print_message("Cluster handling...", 3)
        different_information = _cluster_handler(
            general_information["cluster"], count_sim_papers)
        if different_information is not None:
            full_info["different_information"] = different_information
            settings.print_message(
                "Versions in cluster: %i." % len(different_information), 3)
            return full_info
    # Paper not in cluster => get addition info for it
    if handling_cluster:
        settings.print_message("Cluster link not exists.", 3)
    else:
        settings.print_message("Don't use google cluster.", 3)
    different_information = list()
    different_information.append(dict())
    is_end_note = False
    for link in footer_links:
        if 'endnote' in link.text.strip().lower():
            is_end_note = True
            end_note = get_info_from_EndNote(link['href'].strip(), True)
            if end_note is not None:
                different_information[0].update(end_note)
                different_information[0]["url_scholarbib"] = link[
                    'href'].strip()
        if 'Cited by' in link.text or 'Цитируется' in link.text:
            different_information[0]["citedby"] = int(
                re.findall(r'\d+', link.text)[0])
    if not is_end_note:
        settings.print_message(
            'Error getting EndNote files. '
            'Please change the display settings Google Scholar in English '
            '(https://scholar.google.com/).')
        logger.debug(
            'End work programme because did not find link to EndNote file.')
        input('Press enter to continue')
    full_info["different_information"] = tuple(different_information)
    return full_info
def save_json_metadata(metadata): settings.print_message(" - Saving metadata at json format ... ") f = open("generated/data/metadata.json", "w") f.write(json.dumps(metadata)) f.close()
def upload_package(): settings.print_message(" - Uploading pypi package ... ") os.chdir("generated") call(["python", "setup.py", "sdist", "register", "upload", "-r", "pypi"]) os.chdir("./../")
def update(self, pr_id, pr_name, br_name):
    """Synchronise the commits of one branch between GitLab and Redis.

    Compares the branch's commit set in GitLab with the sorted set stored
    in Redis (key ``p_<id>:<b16-branch-name>``), inserts new commits,
    unlinks deleted ones from their contributors, rebuilds the branch's
    commit zset when anything changed, and refreshes the branch's
    ``contributors`` field.

    :param pr_id:   numeric project identifier
    :param pr_name: project name, forwarded to ``get_commit_info``
    :param br_name: branch name (base16-encoded into the Redis key)

    NOTE(review): Python 2 code — ``long`` and ``base64.b16encode`` on str;
    the old redis-py ``zadd(key, member, score)`` calling convention is used.
    """
    # Generate pseudo-key-id
    __pr_id = "p_" + str(pr_id)
    __br_id = __pr_id + ":" + base64.b16encode(br_name)

    # Data structure for branch's collaborators (b16-encoded author emails)
    __br_info_collaborators = set()

    # Create Redis Data structure (id + score, in this case timestamp);
    # filled as a flat [id, score, id, score, ...] list.
    __co_br = []

    # Get all commits from specific branch (gitlab) ids + commit's info
    __co_gl_val = self.gl_instance.get_projects_repository_commits_byId(
        id=pr_id, ref_name=br_name)
    __co_gl_id = map(lambda x: __pr_id + ":" + x.get("id"), __co_gl_val)
    __co_gl_val = dict(zip(__co_gl_id, __co_gl_val))

    # Get all commits from specific branch (redis) ids + created_at
    __co_rd_id = []
    __co_rd_val = {}
    __prev_info = len(self.rd_instance_br_co.keys(__br_id)) > 0
    if __prev_info:
        # NOTE(review): eval parses the stored list-literal of contributors —
        # assumes the value was written by this tool.
        __br_info_collaborators = set(
            eval(self.rd_instance_br.hgetall(__br_id).get("contributors")))
        __co_rd_val = dict(
            self.rd_instance_br_co.zrange(__br_id, 0, -1, withscores=True))
        __co_rd_id = __co_rd_val.keys()

    # Generate difference and intersection metadata
    __mt_new = list(set(__co_gl_id).difference(set(__co_rd_id)))
    __mt_del = list(set(__co_rd_id).difference(set(__co_gl_id)))
    __mt_mod = list(set(__co_gl_id).intersection(set(__co_rd_id)))

    # Fill branch's commits without deleted
    if __prev_info:
        [__co_br.extend([i, long(__co_rd_val[i])]) for i in __mt_mod]

    # Regenerate structure of branch: drop the zset so it can be re-injected
    if len(__mt_new) > 0 or len(__mt_del) > 0:
        self.rd_instance_br_co.delete(__br_id)

    # Update or add commits to redis
    for i in __mt_new:
        # Get commit identifier (sha) + info
        __co_id = i
        __co_id_org = str(__co_id).replace(__pr_id + ":", "")

        # Get email from commit and add as contributor
        __co_em = __co_gl_val[__co_id].get('author_email').lower()
        __user_key = base64.b16encode(__co_em)
        __br_info_collaborators.add(__user_key)

        # Get information from gitlab or redis
        if len(self.rd_instance_co.keys(__co_id)) == 0:
            __co_info = __co_gl_val[__co_id]
            st_clean.commit(__co_info)

            # Get commit information from git log
            get_commit_info(pr_id, pr_name, __co_info)
            __co_info["author"] = __user_key

            # Insert commit information
            self.rd_instance_co.hmset(__co_id, __co_info)
        else:
            __co_info = self.rd_instance_co.hgetall(__co_id)

        # Set values at Redis Structure - User
        self.rd_instance_us_co.zadd(__user_key, __br_id + ":" + __co_id_org,
                                    long(__co_info.get("created_at")))

        # Set values at Redis Structure - Branch (id + timestamp)
        __co_br.append(__co_id)
        __co_br.append(long(__co_info.get("created_at")))

    for i in __mt_del:
        # Get commit identifier (sha) + info
        __co_id = i
        __co_id_org = str(__co_id).replace(__pr_id + ":", "")
        __co_info = self.rd_instance_co.hgetall(__co_id)

        # Get email from commit and add as contributor
        __co_em = __co_info.get('author_email').lower()
        __user_key = base64.b16encode(__co_em)
        self.rd_instance_us_co.zrem(__user_key, __br_id + ":" + __co_id_org)

    # Check if contributors keep being same: drop any contributor who no
    # longer has a commit on this branch.
    if len(__mt_del) > 0:
        __br_info_collaborators_tmp = __br_info_collaborators.copy()
        for i in __br_info_collaborators:
            count_co = 0
            __br_us_co = self.rd_instance_us_co.zrange(i, 0, -1)
            for j in __br_us_co:
                if str(j).startswith(__br_id):
                    count_co = 1
                    break
            if count_co == 0:
                __br_info_collaborators_tmp.remove(i)
        __br_info_collaborators = __br_info_collaborators_tmp

    # Inject commits to branch from data structure filled
    if len(__mt_new) > 0 or len(__mt_del) > 0:
        inject.inject_branch_commits(self.rd_instance_br_co, pr_id, br_name,
                                     __co_br)

    # Insert information to branch
    self.rd_instance_br.hset(__br_id, 'contributors',
                             list(__br_info_collaborators))

    if len(__mt_new) > 0:
        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) Added %d Commits" %
                                 (int(pr_id), len(__mt_new)))
    if len(__mt_del) > 0:
        # Print alert
        if config.DEBUGGER:
            config.print_message("* (%d) Deleted %d Commits" %
                                 (int(pr_id), len(__mt_del)))