def _get_value(self, json_data, key, default="", convert=True):
    """Look up a value by its canonical key, falling back to alias keys."""
    value = default
    if key in json_data:
        value = json_data[key]
    elif key in self.reverse_key_map:
        for alias_key in self.reverse_key_map[key]:
            if alias_key in json_data:
                value = json_data[alias_key]
                break
    ret = Utils.format_value(value, convert)
    # Below, normalize the value for specific keys.
    if key == "release_date":
        if type(ret) is not list:
            ret = ret.replace("00:00:00", "")
            ret = Utils.format_datetime(ret)
        else:
            if len(ret) == 0:
                return ""
            ret[0] = ret[0].replace("00:00:00", "")
            ret[0] = Utils.format_datetime(ret[0])
    return ret
class GitSpider:

    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    def _get_words(self, url):
        text = self.util.req(url)
        if not text:
            return
        soup = bs4.BeautifulSoup(text, 'lxml')
        soup_article = soup.find('article')
        return soup_article.get_text(' ') if soup_article else None

    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w') as f:
            f.write(words)

    def start(self):
        if not self.links:
            return
        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
def _transform_wiley(self, json_data):
    # Wiley records scraped without a DOI: recover it from the access URL.
    doi = self._get_value(json_data, "doi")
    if doi == "":
        doi = Utils.regex_extract(
            self._get_value(json_data, "access_url"),
            ".*onlinelibrary.wiley.com/doi/(.*)")
    json_data["doi"] = doi
    return json_data
def _format_author_sup(self, origin_sup):
    """
    Format an author sup (affiliation superscript) string.

    Examples:
        case 1: origin_sup: a-c,d -> return: 1,2,3,4
        case 2: origin_sup: 1-4   -> return: 1,2,3,4
    """
    origin_sup = origin_sup.encode("utf-8").replace(" ", "").replace(
        "–", "-").strip("").strip(",").replace(",", "-")
    # Map letter sups to numbers: a -> 1, b -> 2, ...
    origin_sup = "".join([
        str(ord(sup) - ord('a') + 1) if sup.isalpha() else sup
        for sup in origin_sup
    ])
    origin_sup = Utils.regex_extract(origin_sup, "\w*(\d+)")
    if origin_sup == "":
        origin_sup = '1'
    sups = origin_sup.split(",")
    res = []
    for sup in sups:
        if sup.find("-") != -1:
            # Expand a range such as "1-4" into 1,2,3,4.
            elems = sup.split("-")
            if len(elems) != 2:
                raise Exception("unexpected sup: %s" % origin_sup)
            start = int(elems[0])
            end = int(elems[1])
            res.extend(range(start, end + 1))
        else:
            res.append(sup)
    ret = ",".join([str(e) for e in res])
    return ret
def _transform_sage(self, json_data):
    # Sage records crawled with Portia are missing the DOI;
    # recover it from the access URL.
    doi = self._get_value(json_data, "doi")
    if doi == "":
        doi = Utils.regex_extract(
            self._get_value(json_data, "access_url"),
            ".*sagepub.com/doi/full/(.*)")
    json_data["doi"] = doi
    return json_data
def start(self):
    self._load_journal_meta()
    with open(self.origin_meta_file) as f:
        for line in f:
            try:
                line = line.strip()
                json_data = json.loads(line)
            except Exception as e:
                continue
            if json_data["from_url"] not in self.journal_meta:
                raise Exception("conference_url %s not expected" %
                                json_data["from_url"])
            journal_meta = self.journal_meta[json_data["from_url"]]
            new_data = {}
            new_data["id"] = journal_meta["id"]
            new_data["conference"] = journal_meta["conference"]
            new_data["issn"] = json_data.get("issn")
            new_data["title"] = json_data.get("title")
            new_data["abstract"] = json_data.get("abstract")
            new_data["author"] = json_data.get("author")
            new_data["keywords"] = json_data.get("keywords")
            new_data["release_year"] = json_data.get("release_year")

            # Parse the page range, e.g. "Pages 12-18" -> start/end/total.
            page = json_data.get("page", "").replace("Pages", "").replace(
                "Page", "").strip()
            page_infos = page.split("-")
            if len(page_infos) != 1:
                start_page = Utils.str_to_num(page_infos[0].strip())
                end_page = Utils.str_to_num(page_infos[1].strip())
                total_page = end_page - start_page + 1
            else:
                start_page = Utils.str_to_num(page_infos[0].strip())
                end_page = start_page
                total_page = 1
            new_data["start_page"] = start_page
            new_data["end_page"] = end_page
            new_data["total_page"] = total_page

            new_data["pdf_path"] = os.path.join(
                journal_meta["id"], Utils.get_pdf_filename(json_data))
            new_data["doi"] = json_data.get("doi")
            new_data["conference_url"] = json_data["from_url"]
            new_data["access_url"] = json_data["access_url"]
            new_data["pdf_url"] = json_data["pdf_url"]
            print json.dumps(new_data)
def _transform_scielo(self, json_data):
    if type(json_data["date"]) is list:
        tmp = " ".join(json_data["date"])
    else:
        tmp = json_data["date"]
    # Strip UTF-8 non-breaking spaces (bytes \xc2 \xa0) and dots, then split.
    dates = tmp.encode("utf-8").replace("\xa0", " ").replace(
        "\xc2", " ").replace(".", "").split()
    if len(dates) == 1:
        date = dates[0]
    else:
        try:
            date = Utils.format_datetime(" ".join(dates[-2:]))
        except Exception:
            try:
                date = Utils.format_datetime(" ".join(dates[-3:]))
            except Exception:
                date = "%s-08-01" % dates[-1]
    json_data["release_date"] = date
    return json_data
def __init__(self):
    # super(_Down, self).__init__()
    self.util = Utils()

    # Target document settings.
    # self.url = 'https://interactivepython.org/courselib/static/pythonds/index.html'
    # self.domain = 'https://interactivepython.org/courselib/static/pythonds/'
    # self.title = 'Problem Solving with Algorithms and Data Structures using Python.txt'

    # self.url = 'http://chimera.labs.oreilly.com/books/1230000000393/index.html'
    # self.domain = 'http://chimera.labs.oreilly.com/books/1230000000393/'
    # self.title = 'Python Cookbook.txt'

    self.url = 'http://docs.peewee-orm.com/en/stable/'
    self.domain = self.url
    self.title = 'peewee.txt'
class Stspider:

    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # Extract all text content from a page.
    def _get_words(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        body = soup.find('body')
        if not body:
            return
        return body.get_text(' ')

    # Save the text content.
    def _save(self, url, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w') as f:
            f.write(words)

    # Entry point.
    def start(self):
        if not self.links:
            return
        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
class _Settings:

    def __init__(self):
        # Manual settings: topic links.
        self.topic = [
            # python topic
            # 'https://stackoverflow.com/documentation/python/topics'
            # 'https://stackoverflow.com/documentation/django/topics',
            # 'https://stackoverflow.com/documentation/algorithm/topics',
            'https://stackoverflow.com/documentation/git/topics',
            # 'https://stackoverflow.com/documentation/design-patterns/topics',
            # 'https://stackoverflow.com/documentation/flask/topics'
        ]
        # Collected question links.
        self.res = []

        # =======================
        # Don't change anything below.
        self.util = Utils()
        self.domain = 'https://stackoverflow.com'

    # Parse all answer links under each topic.
    def _parse_topic(self):
        if not self.topic:
            return
        for url in self.topic:
            self._add_url(url)

    def _add_url(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', class_='doc-topic-link')
        for a in soup_a:
            last = a.get('href', None)
            self.res.append(self.domain + last)

        # Follow the next page, if any.
        soup_next = soup.find('a', attrs={'rel': 'next'})
        if soup_next:
            next_url = self.domain + soup_next['href']
            return self._add_url(next_url)

    def parse(self):
        self._parse_topic()
        return self.res
class _Settings:

    def __init__(self):
        # Github projects which collect many python repositories
        # (collection pools such as awesome-python).
        self.projectsPool = [
            # 'https://github.com/vinta/awesome-python'
        ]
        # Standalone repositories.
        self.projectsUrl = ['https://github.com/zx576/scancode_backend']
        # General crawler toolbox.
        self.util = Utils()

    # Parse pool projects (like awesome-python) and return the GitHub URLs
    # of every listed repository, filtering out links that point off-site.
    def _parse_pool(self):
        if not self.projectsPool:
            return []
        links = []
        for project in self.projectsPool:
            page = self.util.req(project)
            if not page:
                continue
            links += self._parse_html_get_links(page)
        return links

    # Use bs4 to parse the html and return all github.com links.
    def _parse_html_get_links(self, page):
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', href=re.compile('https://github.com/'))
        links = []
        for a in soup_a:
            links.append(a['href'])
        return links

    def parse(self):
        # Deduplicate urls.
        return list(set(self.projectsUrl + self._parse_pool()))
def _init(self):
    # reverse_key_map records, for each canonical key, the irregular alias
    # keys it may appear under (see the sketch after this method).
    self.reverse_key_map = {}
    for k, v in self.key_map.items():
        if v in self.reverse_key_map:
            self.reverse_key_map[v].append(k)
        else:
            self.reverse_key_map[v] = [k]

    # Create a working directory and place the output files inside it.
    workdir = Utils.generate_workdir()
    self.pass_meta_file = os.path.join(workdir, self.pass_meta_file)
    self.bad_meta_file = os.path.join(workdir, self.bad_meta_file)
    self.miss_pdf_file = os.path.join(workdir, self.miss_pdf_file)
    self.pass_meta_writer = open(self.pass_meta_file, "w")
    self.bad_meta_writer = open(self.bad_meta_file, "w")
    self.miss_pdf_writer = open(self.miss_pdf_file, "w")
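# For reference, a minimal sketch (not part of the original code) of how
# key_map relates to the reverse_key_map built above. The alias names in
# key_map below are hypothetical examples, not values taken from this repo.
#
# key_map = {            # alias key as crawled -> canonical key
#     "publication date": "release_date",
#     "date": "release_date",
#     "authors": "author",
# }
#
# reverse_key_map = {}
# for k, v in key_map.items():
#     reverse_key_map.setdefault(v, []).append(k)
#
# # reverse_key_map now groups the aliases under each canonical key, e.g.
# # {"release_date": ["publication date", "date"], "author": ["authors"]},
# # which is what _get_value consults when the canonical key is missing.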
def check_miss_journal(self):
    """
    Check for journals that were not crawled.

    Usage:
        python meta_check.py --file xxx --journal_file xxx(xls file)
                             [--source_name xxx (e.g. wiley)]
    """
    required_args = ['file', 'journal_file']
    help_message = "python meta_check.py --file xxx --journal_file xxx(xls_file) " \
                   "[--source_name xxx(e.g. wiley)]"
    self._verify_args(required_args, help_message)

    # Journals we expect to have crawled, optionally filtered by source.
    all_journal_meta = Utils.load_journal_meta(self.args.journal_file)
    should_crawl_journal_meta = {}
    if self.args.source_name is not None:
        for journal, meta in all_journal_meta.iteritems():
            journal_url = meta['journal_url'].lower()
            source_name = self.args.source_name.lower()
            if journal_url.find(source_name) == -1:
                continue
            should_crawl_journal_meta[journal] = meta
    else:
        should_crawl_journal_meta = all_journal_meta

    # Journals actually present in the crawled meta file.
    crawled_journal = {}
    with open(self.args.file) as fp:
        for line in fp:
            try:
                json_data = json.loads(line)
            except Exception as e:
                continue
            journal = json_data['journal'].lower()
            if journal in crawled_journal:
                crawled_journal[journal] = crawled_journal[journal] + 1
            else:
                crawled_journal[journal] = 1

    for journal, meta in should_crawl_journal_meta.iteritems():
        if journal not in crawled_journal:
            print "miss %s, url: %s" % (journal,
                                        "%s/issues" % meta['journal_url'])
def __init__(self):
    self.util = Utils()
def _transform_intechopen(self, json_data):
    item_type = self._get_value(json_data, "_type")
    if item_type == "book_item":
        # For books, most fields are extracted from the content string, e.g.:
        # Edited by Aldemaro Romero and Edward O. Keith , ISBN 978-953-51-0844-3, 248 pages, Publisher: InTech, Chapters published November 07, 2012 under CC BY 3.0 license DOI: 10.5772/2731 Edited Volume
        # Authored by Amira Abdelrasoul, Huu Doan and Ali Lohi , ISBN 978-953-51-3662-0, Print ISBN 978-953-51-3661-3, 232 pages, Publisher: InTech, Chapters published December 06, 2017 under CC BY-NC 4.0 license DOI: 10.5772/65691 Monograph
        # ISBN 978-953-51-3376-6, Print ISBN 978-953-51-3375-9, 262 pages, Publisher: InTech, Chapters published August 23, 2017 under CC BY 3.0 license DOI: 10.5772/intechopen.68449 Monograph
        content = self._get_value(json_data, "content")
        content_regex = re.compile(
            "(?P<authors>(Edited|Authored) by .*, )?ISBN (?P<isbn>[\w-]+), (?P<print_isbn>Print ISBN .*, )?(?P<pages>\d+) pages, Publisher: (?P<publisher>.*), Chapters published (?P<publish_date>.*) under (?P<license_type>.* license).*"
        )
        match = content_regex.match(content)
        if not match:
            print json_data
            raise Exception("content not match regex: %s" % content)
        json_data['dc:creater'] = match.group("authors")
        json_data['eisbn'] = match.group("isbn")
        json_data['hardcover_PISBN'] = match.group("print_isbn")
        json_data['page'] = match.group("pages")
        json_data['publisher'] = match.group("publisher")
        json_data['release_date'] = Utils.strptime(
            match.group("publish_date")).strftime("%Y-%m-%d")
        json_data['license_type'] = match.group("license_type")
        json_data.pop('chapters', None)
        json_data.pop('content', None)
    elif item_type == "chapter_item":
        # For chapters, mainly normalize the author/affiliation fields.
        if self._key_exist(json_data, "author_affliication"):
            author = self._get_value(json_data, "author")
            start_chars = "<div class=\"authors-front\">"
            end_chars = "</div>"
            author_content = Utils.extract_chars(author, start_chars,
                                                 end_chars)
            if author_content == "":
                # Sometimes the authors cannot be extracted from that div.
                author_content = self._get_value(json_data, "author_field")
            # Some authors carry several sups, e.g. <sup>1, </sup><sup>2</sup>;
            # normalize them first.
            author_content = re.sub("<sup>(\d*)(, ){0,1}</sup>",
                                    "<sup>\g<1></sup>", author_content)
            # Some author lists are separated by "and" instead of commas.
            author_content = author_content.replace("and", ",")
            author_elems = author_content.split(",")
            authors = []
            author_sups = []
            author_affliication = json_data['author_affliication']
            author_affliication = [x for x in author_affliication
                                   if x.strip().startswith('[')]
            for author_elem in author_elems:
                sup_start_chars = "<sup>"
                sup_end_chars = "</sup>"
                try:
                    sup_start_index = author_elem.index(sup_start_chars)
                    author_text = author_elem[0:sup_start_index]
                    sup = Utils.extract_chars(author_elem, sup_start_chars,
                                              sup_end_chars)
                except Exception as e:
                    sup = "1"
                    author_text = author_elem
                if not sup.isdigit():
                    sup = "1"
                authors.append(author_text)
                author_sups.append(sup)
            json_data = Utils.format_authors(json_data, authors, author_sups,
                                             author_affliication)
    json_data.pop('author_field', None)
    return json_data
def start(self):
    # a = "<font size=\"5\"><a name=\"top1\"></a>Efeito de fungicidas na germinação <i>in vitro</i> de conídios de <i>Claviceps africana</i><sup>(<a href=\"#back1\">1</a>)</sup></font>"
    # self._format_scielo_authors(a)
    # sys.exit(0)
    with open(self.meta_file) as f:
        for line in f:
            try:
                self.total = self.total + 1
                line = line.strip()
                json_data = json.loads(line)
            except Exception as e:
                self.json_fail = self.json_fail + 1
                continue

            # Check 1: access_url is the most basic field; if even that was
            # not crawled, there is no way to locate the problem.
            access_url = self._get_value(json_data, "access_url")
            if access_url == "":
                self.no_access_url = self.no_access_url + 1
                continue

            # Apply platform-specific transformations to the metadata.
            json_data = self._transform(json_data)

            # Check 2: verify that every required field was collected.
            miss_required_field = False
            for key in self.required_keys:
                value = self._get_value(json_data, key)
                if value == "":
                    # An empty value means this field was not collected.
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete = self.incomplete + 1
                    miss_required_field = True
                    break
            if miss_required_field:
                continue

            # Check 3: check the remaining keys for empty required fields.
            fail = False
            for key, value in json_data.iteritems():
                key = key.strip(":").strip()
                value = Utils.format_value(value)
                if value == "" and key in self.required_keys:
                    if key == "release_year":
                        publish_date = self._get_value(
                            json_data, "release_date")
                        if publish_date != "":
                            # If the publish date is also empty, there is no
                            # way to derive the publish year.
                            json_data["release_year"] = publish_date.split(
                                "-")[0]
                            print "publish year is %s" % json_data[
                                "release_year"]
                            continue
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete = self.incomplete + 1
                    fail = True
                    break
            if fail:
                continue

            # Check 4: fill in some required fields.
            json_data['acquisition_time'] = Utils.current_time()
            publish_year = self._get_value(json_data, "release_year")
            if publish_year == "":
                publish_date = self._get_value(json_data, "release_date")
                if publish_date != "":
                    json_data["release_year"] = publish_date.split("-")[0]

            # author / author_sup / author_affiliation handling happens when
            # the record is marked; dedup by access_url first.
            if access_url in self.pass_meta_map:
                title = self._get_value(json_data, "title")
                if title != self.pass_meta_map[access_url]:
                    pass
                    # raise Exception("same url with different title, not gonna happen: %s" % access_url)
                self.dup = self.dup + 1
                continue
            self.pass_count = self.pass_count + 1
            self._mark_success_record(json_data)
            self.pass_meta_map[access_url] = json_data["title"]

    if self.args.pdf_dir is not None:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pdf_non_exist: %d, pdf_exist_count: %d, pass meta save to: %s, fail meta save to: %s, miss pdf url save to: %s" \
            % (self.total, self.no_access_url, self.json_fail,
               self.incomplete, self.dup, self.pass_count,
               self.pdf_non_exist, self.pdf_exist, self.pass_meta_file,
               self.bad_meta_file, self.miss_pdf_file)
    else:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pass meta save to: %s, fail meta save to: %s" \
            % (self.total, self.no_access_url, self.json_fail,
               self.incomplete, self.dup, self.pass_count,
               self.pass_meta_file, self.bad_meta_file)
def __init__(self):
    self.links = _Settings().parse()
    self.util = Utils()
def _mark_success_record(self, json_data):
    """
    Mark a successful record.

    1. Normalize the keys and add some required fields.
    2. If a pdf save dir is given, match the record with its pdf file.
    3. Drop useless keys (e.g. Portia adds fields such as _template).

    @param json_data
    """
    publish_date = self._get_value(json_data, "release_date")
    if "release_year" not in json_data or json_data["release_year"] == "":
        # If the publish date is also empty, there is no way to get the
        # publish year.
        if "release_date" in json_data:
            json_data["release_year"] = publish_date.split()[-1]

    if "keywords" in json_data:
        if type(json_data["keywords"]) is list \
                and len(json_data["keywords"]) == 1:
            # Some Portia-crawled journals put all keywords into a single
            # element; split them apart.
            keywords = json_data["keywords"][0].replace("Keywords",
                                                        "").strip()
            json_data["keywords"] = keywords.split(";")
            if len(json_data["keywords"]) == 1:
                json_data["keywords"] = keywords.split(",")
        elif self.for_oa and type(json_data["keywords"]) is not list:
            # For OA sources, convert keywords into a list if needed.
            keywords = json_data["keywords"].replace(
                "Index terms:", "").replace("Keywords", "").split(";")
            json_data["keywords"] = keywords

    convert_data = {}
    for key, value in json_data.iteritems():
        format_key = key.strip(":").strip().lower()
        if format_key in self.key_map:
            format_key = self.key_map[format_key]
        # These keys never have their lists converted to strings.
        if format_key in self.reserved_non_converted_keys:
            convert = False
        else:
            convert = not self.for_oa
        # _get_value also normalizes the value.
        value = self._get_value(json_data, format_key, convert=convert)
        convert_data[format_key] = value

    # Normalize authors and author affiliations.
    is_scielo = False
    if is_scielo:
        # 2018.04.18: SciELO authors need separate handling.
        if convert_data["author"][0].find("<") != -1:
            author_raw_text = " ".join(convert_data["author"])
            authors = self._format_scielo_authors(author_raw_text)
            convert_data['author'] = authors
            convert_data.pop("author_affiliation", None)
    elif 'author' in convert_data and len(convert_data["author"]) == 1:
        # The author field may be one blob of html containing several
        # authors, each with its sup inside a <sup> tag. This happens when
        # the affiliations are hard to crawl and the whole html fragment
        # ends up in the author field.
        authors, author_sups = self._format_authors(convert_data)
        if 'author_sup' not in convert_data:
            convert_data['author_sup'] = author_sups
        else:
            convert_data['author_sup'] = [
                self._format_author_sup(sup)
                for sup in convert_data['author_sup']
            ]
        if len(authors) == 1:
            # If there is still only one element, the authors are probably
            # comma-separated.
            authors = authors[0].split(",")
        convert_data['author'] = authors
        if 'author_sup' in convert_data:
            convert_data['author_sup'] = [
                self._format_author_sup(sup)
                for sup in convert_data['author_sup']
            ]

    if "author_affiliation" in convert_data and len(
            convert_data['author_affiliation']) == 1:
        # This author_affiliation may also be one blob of html.
        author_affiliation = convert_data['author_affiliation'][0]
        try:
            authors = convert_data['author']
            if author_affiliation.startswith(authors[0]):
                # Affiliations separated by author names, e.g.
                # https://koedoe.co.za/index.php/koedoe/article/view/188
                author_affiliation = self._format_author_affiliations_by_author(
                    author_affiliation, authors)
            else:
                author_affiliation = self._format_author_affiliations(
                    convert_data)
            convert_data['author_affiliation'] = author_affiliation
        except Exception as e:
            # Affiliations were crawled but authors were not; ignore this
            # case for now.
            authors = []
            convert_data['author_affiliation'] = []
            convert_data['author'] = []
            convert_data['author_sup'] = []

    # author_sup may contain empty strings (e.g. SciELO); drop them.
    if "author_sup" in convert_data and type(
            convert_data["author_sup"]) is list:
        convert_data["author_sup"] = [
            i for i in convert_data["author_sup"] if i != ''
        ]

    if self.args.pdf_dir is not None:
        filename = Utils.get_pdf_filename(json_data)
        pdf_path = os.path.join(self.args.pdf_dir, filename + ".pdf")
        txt_path = os.path.join(self.args.pdf_dir, filename + ".txt")
        if os.path.exists(pdf_path):
            convert_data["pdf_path"] = filename + ".pdf"
            self.pdf_exist = self.pdf_exist + 1
        elif os.path.exists(txt_path):
            convert_data["pdf_path"] = filename + ".txt"
            self.pdf_exist = self.pdf_exist + 1
        else:
            # print "pdf path(%s) or txt path(%s) not exist" % (pdf_path, txt_path)
            convert_data["pdf_path"] = "wrong"
            self.pdf_non_exist = self.pdf_non_exist + 1
            pdf_link = self._get_value(json_data, "pdf_url")
            if pdf_link == "":
                raise Exception("cannot get pdf_url from json %s" % json_data)
            self.miss_pdf_writer.write(pdf_link)
            self.miss_pdf_writer.write("\n")

    # Normalize author and author_affiliation.
    if not self.for_oa:
        pass
        # convert_data = Utils.format_authors_from_json(convert_data)

    # Drop some keys.
    if not self.for_oa:
        convert_data.pop("author_sup", None)
    else:
        # OA records need extra handling for the doi.
        convert_data["doi"] = self._get_value(convert_data, "doi").replace(
            "https://doi.org/", "").replace("http://doi.org/", "")
    convert_data.pop('_template', None)

    convert_data_str = json.dumps(convert_data)
    self.pass_meta_writer.write(convert_data_str)
    self.pass_meta_writer.write("\n")
class _Down:

    def __init__(self):
        self.util = Utils()

    def _save(self, title, words):
        self.util.checkpath(PATH_DIR)
        if not words:
            return
        with open(PATH_DIR + title, 'a+') as f:
            f.write(words)

    # Recursively crawl every queued page of a document.
    def _download(self, qu, domain, title, switch=True):
        # print(title)
        if qu.empty():
            return
        url = qu.get()
        text = self.util.req(url)
        if not text:
            # qu.put(url)
            return self._download(qu, domain, title, False)
        if switch:
            res = self._download_links(domain, text)
            for i in res:
                qu.put(i)
        words = self._download_docs(text)
        self._save(title, words)
        return self._download(qu, domain, title, switch=False)

    def _download_docs(self, page):
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_body = soup.find('body')
        words = ''
        if soup_body:
            words += soup_body.get_text(' ')
        return words

    def _download_links(self, domain, page):
        lst = []
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_link = soup.find_all('a')
        for link in soup_link:
            href = link.get('href')
            if href:  # skip anchors without an href
                lst.append(domain + href)
        return lst

    def download(self, url, domain, title):
        # title = 'Problem Solving with Algorithms and Data Structures using Python.pdf'
        qu = queue.Queue()
        qu.put(url)
        return self._download(qu, domain, title)
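# A minimal usage sketch (not part of the original scripts), assuming the
# peewee settings shown in the docs-spider __init__ above; the __main__
# guard is added here for illustration only.
if __name__ == '__main__':
    downloader = _Down()
    # Crawl the index page and the pages it links to, appending the
    # extracted text to PATH_DIR + 'peewee.txt'.
    downloader.download('http://docs.peewee-orm.com/en/stable/',
                        'http://docs.peewee-orm.com/en/stable/',
                        'peewee.txt')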
# coding=utf-8
# author = zhouxin
# description: download pdf files; add the pdf download URLs to downlst
# and run this script.

import requests

from spiders.utils import Utils

PATH_DIR = 'download/'
util = Utils()


def download(url):
    util.checkpath(PATH_DIR)
    req = requests.get(url)
    c = req.content
    name = url.split('/')[-1]
    with open(PATH_DIR + name, 'wb') as f:
        f.write(c)


downlst = [
    # 'http://files2.syncfusion.com/Downloads/Ebooks/SciPy_Programming_Succinctly.pdf',
    # 'https://docs.google.com/file/d/0B8IUCMSuNpl7MnpaQ3hhN2R0Z1k/edit'
    # 'http://stock.ethop.org/pdf/python/Learning%20Python,%205th%20Edition.pdf',
    # 'http://slav0nic.org.ua/static/books/python/OReilly%20-%20Core%20Python%20Programming.pdf',
    # ///////////
    # 'http://www.oreilly.com/programming/free/files/functional-programming-python.pdf',
    # 'https://doc.lagout.org/programmation/python/Python%20Pocket%20Reference_%20Python%20in%20Your%20Pocket%20%285th%20ed.%29%20%5BLutz%202014-02-09%5D.pdf',
# coding=utf-8
# Print selected columns of a JSON-lines meta file, separated by '|'.
import json
import re
import sys

from spiders.utils import Utils

filename = sys.argv[1]
columnname = sys.argv[2]
split = '|'

with open(filename) as fp:
    for line in fp:
        try:
            json_data = json.loads(line)
        except Exception as e:
            continue
        columns = columnname.split(",")
        line = ""
        for column in columns:
            try:
                data = Utils.format_value(json_data[column], join_char='|')
            except Exception as e:
                data = ""
            if column == 'url':
                data = re.sub("\?journalCode=.*", "", data)
            if isinstance(data, int):
                line += str(data) + split
            else:
                line += data.replace('\n', '').replace('\t', '').strip() + split
        # print line.strip().replace(u'ê', 'e').replace(u'é', 'e').replace(u'ã', 'a').replace(u'ó', 'o').replace(u'ú', 'u').strip(split)
        print line.strip().strip(split)
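# Hypothetical invocation (the script and file names are assumptions, not
# taken from the repo): given a pass_meta.json produced by the meta checker,
#
#   python print_columns.py pass_meta.json title,doi,access_url
#
# would print one '|'-separated line per JSON record, e.g.
#   Some Title|10.1234/example|https://example.org/article/1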