Example #1
import logging
from ylib import ylog
from ylib.yaml_config import Configuraion  # sic: spelled this way in ylib
from urllib.parse import quote_plus
config = Configuraion()

config.load('/home/weiwu/projects/deep_learning/web_crawl/config.yaml')
USER_AGENT = config.USER_AGENT
DOMAIN = config.DOMAIN
BLACK_DOMAIN = config.BLACK_DOMAIN
URL_SEARCH = config.URL_GOOGLE_SEARCH
PROXIES = config.PROXIES
# note: this overrides the plain Google-search URL assigned above
URL_SEARCH = config.URL_GOOGLE_SCHOLAR
URL_NEXT = config.URL_GOOGLE_SCHOLAR_NEXT

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("app")

# log config
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logger = logging.getLogger('Sci-Hub')
logger.setLevel(logging.DEBUG)

# constants
SCIHUB_BASE_URL = 'http://sci-hub.cc/'
SCHOLARS_BASE_URL = 'https://scholar.google.com/scholar'
HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
}
AVAILABLE_SCIHUB_BASE_URL = [
    'sci-hub.tw', 'sci-hub.hk', 'sci-hub.la', 'sci-hub.mn', 'sci-hub.name',
]  # the mirror list is truncated in the original listing
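Taken together, the constants above are enough to issue a first request. Below is a minimal sketch of how quote_plus, HEADERS, PROXIES, and the mirror list would typically be combined; the requests package and both helper functions are assumptions, not part of the original listing.

import requests
from urllib.parse import quote_plus

def scholar_search(query):
    # URL-encode the query and reuse the headers/proxies defined above.
    url = '%s?q=%s' % (SCHOLARS_BASE_URL, quote_plus(query))
    return requests.get(url, headers=HEADERS, proxies=PROXIES)

def working_scihub_base():
    # Probe mirrors until one answers; which mirror is alive changes
    # over time, so AVAILABLE_SCIHUB_BASE_URL is only a starting point.
    for domain in AVAILABLE_SCIHUB_BASE_URL:
        try:
            if requests.get('http://' + domain, timeout=5).ok:
                return 'http://%s/' % domain
        except requests.RequestException:
            continue
    return SCIHUB_BASE_URL  # fall back to the default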
Example #2
from collections import defaultdict
from ylib import ylog
import re
from lib.gftTools import gftIO
import os
import sys
import logging
from tqdm import tqdm
import time
import json
import networkx as nx

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("cycles")
graph = nx.read_gexf('whole_edges.no_loops.gexf')
ls_nodes = list(graph.nodes)
counter = 0
total_nodes_num = 287966
rm_counter = 0
try:
    while True:
        ylog.debug('rm cycles loops number %s' % counter)

        for node in tqdm(ls_nodes):
            removed_counter = 0
            ylog.debug('rm cycles of node %s' % node)

            while True:
                try:  # the listing is cut off here; see the sketch below
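The listing is cut off inside the innermost try block. A plausible completion, assuming the loop strips cycles one edge at a time with nx.find_cycle; this is a sketch, not the original author's code. It is shown unindented for readability and would sit inside the for loop above.

# Sketch: one plausible body for the truncated inner loop.
while True:
    try:
        # find_cycle raises NetworkXNoCycle once `node` sits on no
        # remaining cycle.
        cycle = nx.find_cycle(graph, source=node)
    except nx.NetworkXNoCycle:
        break
    # Drop one edge of the reported cycle and search again.
    u, v = cycle[0][:2]
    graph.remove_edge(u, v)
    removed_counter += 1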
Example #3
import re
from lib.gftTools import gftIO
import skill_pb2
import graphUpload_pb2
from tqdm import tqdm
import time
import hashlib
from graph_upload import batch_upload
from graph_upload import upload_cat_node
from graph_upload import upload_page_node
from graph_upload import upload_edge
import logging
from ylib import ylog
ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("wiki_upload")
# ylog.debug("test")
batch_size = 2
# test fetch graph
test_url = 'http://192.168.1.166:9080'
prod_url = 'http://q.gftchina.com:13567/vqservice/vq/'
test_user_name = 'wuwei'
test_pwd = 'gft'
gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd)
try:
    graph = gftIO.get_graph_from_neo4j('392482970E904D11190D208B7C22874A',
                                       server_url=test_url,
                                       user_name=test_user_name,
                                       pwd=test_pwd)
except Exception:
    # tolerate a failed fetch (e.g. the graph id may not exist on this server)
    pass
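batch_size = 2 above implies the nodes are pushed in fixed-size groups. The real batch_upload helper is imported but its signature is not shown in this listing; here is a self-contained sketch of the batching pattern itself.

def chunked(items, size):
    # Yield consecutive slices of at most `size` items.
    for start in range(0, len(items), size):
        yield items[start:start + size]

# Hypothetical driver: push the fetched graph two nodes at a time.
# for batch in chunked(list(graph.nodes), batch_size):
#     batch_upload(...)  # signature not shown in this listing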
Example #4
from ylib import ylog
import logging
import os
import sys
import hashlib
from hanziconv import HanziConv
from google.protobuf.message import EncodeError
from urllib.error import HTTPError
from lib.gftTools.gftIO import GSError
from google.protobuf.message import DecodeError
from http.client import RemoteDisconnected
import networkx as nx
from collections import defaultdict

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on('remove_cycles')
# batch upload size
BATCH_SIZE = 20
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
NODES_FAIL_MAX_RETRIES = 3
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (HTTPError, ConnectionResetError, RemoteDisconnected)
GRAPH_EXCEPTIONS = (EncodeError, DecodeError)
# Always retry when an HTTP error with one of these status codes is
# raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504, 111]
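# Chinese-Wikipedia maintenance categories to skip; the strings must stay
# in Chinese to match the category names on zh.wikipedia.org.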
IGNORE_CATEGORIES = [
    '使用Catnav的页面', '缺少Wikidata链接的维基共享资源分类', '隐藏分类', '追踪分类', '维基百科特殊页面',
    '维基百科分类', '维基百科维护', '无需细分的分类', '不要删除的分类', '母分类', '全部重定向分类', '特殊条目'
]
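These constants describe a retry policy, but the loop that consumes them falls outside this excerpt. A sketch of the usual shape, with randomized exponential backoff; do_request is a hypothetical stand-in for the real upload call.

import random
import time

def call_with_retries(do_request, *args, **kwargs):
    # Retry only the transient failures listed in RETRIABLE_EXCEPTIONS,
    # backing off exponentially; give up after MAX_RETRIES attempts.
    for attempt in range(MAX_RETRIES):
        try:
            return do_request(*args, **kwargs)
        except RETRIABLE_EXCEPTIONS as e:
            ylog.debug('retriable error %s (attempt %d/%d)' %
                       (e, attempt + 1, MAX_RETRIES))
        time.sleep(random.uniform(0, 2 ** attempt))
    raise RuntimeError('gave up after %d retries' % MAX_RETRIES)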