from ylib.yaml_config import Configuraion from urllib.parse import quote_plus config = Configuraion() config.load('/home/weiwu/projects/deep_learning/web_crawl/config.yaml') USER_AGENT = config.USER_AGENT DOMAIN = config.DOMAIN BLACK_DOMAIN = config.BLACK_DOMAIN URL_SEARCH = config.URL_GOOGLE_SEARCH PROXIES = config.PROXIES URL_SEARCH = config.URL_GOOGLE_SCHOLAR URL_NEXT = config.URL_GOOGLE_SCHOLAR_NEXT ylog.set_level(logging.DEBUG) ylog.console_on() ylog.filelog_on("app") # log config logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logger = logging.getLogger('Sci-Hub') logger.setLevel(logging.DEBUG) # constants SCIHUB_BASE_URL = 'http://sci-hub.cc/' SCHOLARS_BASE_URL = 'https://scholar.google.com/scholar' HEADERS = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' } AVAILABLE_SCIHUB_BASE_URL = [ 'sci-hub.tw', 'sci-hub.hk', 'sci-hub.la', 'sci-hub.mn', 'sci-hub.name',
from collections import defaultdict from ylib import ylog import re from lib.gftTools import gftIO import os import sys import logging from tqdm import tqdm import time import json import networkx as nx ylog.set_level(logging.DEBUG) ylog.console_on() ylog.filelog_on("cycles") graph = nx.read_gexf('whole_edges.no_loops.gexf') ls_nodes = list(graph.nodes) counter = 0 total_nodes_num = 287966 rm_counter = 0 try: while True: ylog.debug('rm cycles loops number %s' % counter) for node in tqdm(ls_nodes): removed_counter = 0 ylog.debug('rm cycles of node %s' % node) while True: try:
# Wiki-graph upload bootstrap: configure logging, connect to the GFT
# service, and best-effort fetch the existing graph before batch upload.
import re
from lib.gftTools import gftIO
import skill_pb2
import graphUpload_pb2
from tqdm import tqdm
import time
import hashlib
from graph_upload import batch_upload
from graph_upload import upload_cat_node
from graph_upload import upload_page_node
from graph_upload import upload_edge
import logging
from ylib import ylog

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("wiki_upload")
# ylog.debug("test")

batch_size = 2

# test fetch graph
test_url = 'http://192.168.1.166:9080'
prod_url = 'http://q.gftchina.com:13567/vqservice/vq/'
# SECURITY(review): credentials hard-coded in source; move to config/env.
test_user_name = 'wuwei'
test_pwd = 'gft'
gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd)
try:
    graph = gftIO.get_graph_from_neo4j(
        '392482970E904D11190D208B7C22874A',
        server_url=test_url,
        user_name=test_user_name,
        pwd=test_pwd)
except Exception:
    # fix: was a bare `except: pass`, which also swallowed SystemExit and
    # KeyboardInterrupt.  Keep the best-effort behaviour (script continues
    # without a pre-fetched graph) but narrow the catch and log it.
    ylog.debug('failed to fetch existing graph from neo4j; continuing')
# Constants for the remove-cycles / upload scripts: retry policy, batch
# sizing, and the Wikipedia category names to ignore.
from ylib import ylog
import logging
import os
import sys
import hashlib
from hanziconv import HanziConv
from google.protobuf.message import EncodeError
from urllib.error import HTTPError
from lib.gftTools.gftIO import GSError
from google.protobuf.message import DecodeError
from http.client import RemoteDisconnected
import networkx as nx
from collections import defaultdict

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on('remove_cycles')

# batch upload size
BATCH_SIZE = 20
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
NODES_FAIL_MAX_RETRIES = 3
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (HTTPError, ConnectionResetError, RemoteDisconnected)
GRAPH_EXCEPTIONS = (EncodeError, DecodeError)
# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504, 111]
# Category names to skip — presumably Wikipedia maintenance/meta categories
# with no topical content (confirm against the consumer of this list).
# Runtime data: the Chinese strings are deliberately left untranslated.
IGNORE_CATEGORIES = [
    '使用Catnav的页面',
    '缺少Wikidata链接的维基共享资源分类',
    '隐藏分类',
    '追踪分类',
    '维基百科特殊页面',
    '维基百科分类',
    '维基百科维护',
    '无需细分的分类',
    '不要删除的分类',
    '母分类',
    '全部重定向分类',
    '特殊条目',
]