def getRecentNewsLooply(self, newsType, url):
    print '-----------------------Begin to crawl------------------------'
    print '--------------Crawler Mode: RecentNewsCrawler----------------'
    print '------------------Target news type confirmed-----------------'
    print '-----------------News Type: ' + newsType + '---------------------'
    crawlerLoopControlMessage = get_logger('CrawlerControl.log')
    crawlerLoopControlMessage.info('Begin to crawl')
    crawlerLoopControlMessage.info('Crawler Mode: RecentNewsCrawler')
    crawlerLoopControlMessage.info('News type: ' + newsType)
    count = 1
    jsonDictPage = NewsParser().getJsonByRequests(url)
    # By default, keep crawling news indefinitely
    while True:
        print u'Entering the ', count, u' round crawl'
        if newsType == 'joke':
            self.mongoDBStoreroom.storeJokeIntoMongoDB(jsonDictPage)
        else:
            self.mongoDBStoreroom.storeRecommendTypedNewsIntoMongoDB(
                jsonDictPage)
        nextUrl = url
        jsonDictPage = NewsParser().getJsonByRequests(nextUrl)
        print count, ' Json files have been stored'
        print u'the ', count, u' round ended'
        count = count + 1
    print '-----------------------crawl mission done--------------------------'
def getArticleContent(self, url):
    # Sentinel value: if the html cannot be opened or BeautifulSoup cannot
    # read it, an empty list is returned below.
    article_content = ['-1']
    news_list = []  # holds the extracted news text
    try:
        html = self.htmlGetter.getHtmlByRequests(url)
        soup = BeautifulSoup(html, 'html.parser')
        article_content = soup.select('.article-content')
    except Exception as e:
        print 'Beautiful Soup load error: ', e
        s = str(e)
        getArticleContentError = get_logger('ArticleContentParser.log')
        getArticleContentError.error(
            'ArticleContentParser getArticleContentError Beautiful soup load error: ' + s)
    # If an error occurred above, return the empty list directly
    if article_content == ['-1']:
        return news_list
    # Check whether the page uses the Q&A style layout instead
    if article_content == []:
        article_content = soup.select('.answer-text-full')
    """
    Pages of this type cannot be opened directly in the toutiao.com/group/id
    form without changing the request header, so parsing them is pointless.
    if article_content == []:
        article_content = soup.select('.tt-ignored-node')
    """
    for news in article_content:
        newsText = news.get_text()
        news_list.append(newsText)
    return news_list
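# A minimal usage sketch of the parser above (hedged: the group id in the URL
# is a placeholder, not a real article; ArticleContentParser is constructed
# without arguments as elsewhere in this project):
#
#     parser = ArticleContentParser()
#     paragraphs = parser.getArticleContent('http://www.toutiao.com/group/<group_id>/')
#     if not paragraphs:
#         print 'page could not be fetched or parsed'
#     for text in paragraphs:
#         print text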
def storeJokeIntoMongoDB(self, jsonDict):
    """Store TouTiao_JOKE real-time news."""
    jsonDataDict = jsonDict['data']
    # Put every news item from the json file into the collection
    count = 1
    for newsObject in jsonDataDict:
        print u'Begin to get the ', count, u' news'
        count = count + 1
        # If this news item already exists in the database, stop here:
        # everything older has been crawled before.
        if JokeNewsEngine.check_joke_obj(newsObject):
            return True
        try:
            JokeNewsEngine.create_joke_obj(newsObject)
        except Exception as e:
            print 'store JOKE into MongoDB error: ', e
            s = str(e)
            storeJokeIntoMongoDBErrorMessage = get_logger(
                'MongoDBStoreroom.log')
            storeJokeIntoMongoDBErrorMessage.error(
                'MongoDBStoreroom storeJokeIntoMongoDB error: ' + s)
    return False
def getJsonByRequests(self, url):
    # cookie = self.getCookie()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Host': "www.toutiao.com",
        'Referer': "http://www.toutiao.com/",
        'X-Requested-With': "XMLHttpRequest",
        'Cookie': "csrftoken=4c9d92dba03a267de6bdb8d8d2a59802; tt_webid=56609842670; uuid='w:ccb829fa9020459cbe1024b623988c74'; _ga=GA1.2.188412771.1489068623; CNZZDATA1259612802=794809316-1489067802-%7C1489229849; UM_distinctid=15abc5cd6e92d2-0ff005b574fc54-1262694a-e1000-15abc5cd6ea151; utm_source=toutiao; __tasessionId=gkvfupjml1489230426764"
    }
    data = None  # returned as None if the request or json decoding fails
    try:
        request = urllib2.Request(url, headers=headers)
        page = urllib2.urlopen(request).read()
        data = json.loads(page)
    except Exception as e:
        print 'getJsonError: ', e
        s = str(e)
        getJsonByRequestsError = get_logger('NewsParserError.log')
        getJsonByRequestsError.error(
            'getJsonByRequestsError getJsonError: ' + s)
    return data
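# A hedged usage sketch: NewsParser is the class this method belongs to
# elsewhere in the project, and targetUrl stands for whatever
# TouTiaoNewsURL.getTargetURL() would build (a placeholder here).
#
#     jsonDictPage = NewsParser().getJsonByRequests(targetUrl)
#     if jsonDictPage is not None:
#         for newsObject in jsonDictPage['data']:
#             print newsObject.get('title')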
def connectToMongoDB(self):
    try:
        connect(self.dataBase, host=self.host, port=self.port)
    except Exception as e:
        print 'connect to MongoDB error', e
        s = str(e)
        MongoDBControllerConnectionErrorMessage = get_logger('MongoDBController.log')
        MongoDBControllerConnectionErrorMessage.error('mongoDB connection error: ' + s)
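# Hedged note: connect() above is assumed to be mongoengine's connect(), which
# matches the Document-style .save() calls used by the engine classes below.
# A minimal standalone equivalent (database name and host are placeholders):
#
#     from mongoengine import connect
#     connect('toutiao_news', host='127.0.0.1', port=27017)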
def create_regular_obj(cls, ori_data):
    try:
        cls(group_id=ori_data['group_id'],
            news_tag=ori_data['tag'],
            comment_count=ori_data['comments_count'],
            behot_time=ori_data['behot_time'],
            ori_data=ori_data).save()
    except Exception as e:
        print "news object save error: ", e
        s = str(e)
        mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
        mongoDBControllerErrorMessage.error('recommend-typed news object save error: ' + s)
def create_joke_obj(cls, ori_data):
    try:
        cls(id_str=ori_data['group']['id_str'],
            favorite_count=ori_data['group']['favorite_count'],
            comment_count=ori_data['group']['comment_count'],
            go_detail_count=ori_data['group']['go_detail_count'],
            share_count=ori_data['group']['share_count'],
            bury_count=ori_data['group']['bury_count'],
            digg_count=ori_data['group']['digg_count'],
            online_time=ori_data['online_time'],
            ori_data=ori_data).save()
    except Exception as e:
        print "news object save error: ", e
        s = str(e)
        mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
        mongoDBControllerErrorMessage.error('joke news object save error: ' + s)
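# A hedged sketch of the duplicate check that MongoDBStoreroom relies on
# (check_joke_obj is called but not shown here); it assumes this engine is a
# mongoengine Document keyed on id_str, and the real implementation may differ.
def check_joke_obj(cls, ori_data):
    # True if a joke with the same id_str has already been saved
    return cls.objects(id_str=ori_data['group']['id_str']).first() is not None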
def getTimeStampNewsLooply(self, hot_timeStamp, newsType, url):
    """Keep fetching news until the target timestamp is reached."""
    print '-----------------------Begin to crawl------------------------'
    print '----------------Crawler Mode: TimeStampCrawler---------------'
    print '------------------Target timestamp confirmed-----------------'
    print '---------------------' + hot_timeStamp + '--------------------'
    print '------------------Target news type confirmed-----------------'
    print '-----------------News Type: ' + newsType + '---------------------'
    crawlerLoopControlMessage = get_logger('CrawlerControl.log')
    crawlerLoopControlMessage.info('Begin to crawl')
    crawlerLoopControlMessage.info('Crawler Mode: TimeStampCrawler')
    crawlerLoopControlMessage.info('Target timestamp: ' + hot_timeStamp)
    crawlerLoopControlMessage.info('News type: ' + newsType)
    count = 1
    nextNewsType = newsType  # news type to keep crawling
    targetTimeStamp = TouTiaoNewsURL.dateTimeToTimeStamp(
        hot_timeStamp)  # convert the target date to a timestamp string
    intTargetTimeStamp = int(targetTimeStamp)  # str ---> int
    jsonDictPage = NewsParser().getJsonByRequests(url)  # fetch the json file
    nextJsonPage = jsonDictPage['next']['max_behot_time']  # timestamp of the next json file
    print 'target time stamp is: ', intTargetTimeStamp
    # Keep fetching json files until the target timestamp is reached
    while intTargetTimeStamp < nextJsonPage:
        print u'Entering the', count, u' round crawl'
        if newsType == 'joke':
            flag = self.mongoDBStoreroom.storeJokeIntoMongoDB(jsonDictPage)
        else:
            flag = self.mongoDBStoreroom.storeRecommendTypedNewsIntoMongoDB(
                jsonDictPage)
        # Incremental-update flag: if flag is True, the news before this point
        # has already been crawled and the task can stop.
        if flag:
            break
        nextUrl = TouTiaoNewsURL.getTargetURL(nextNewsType, str(nextJsonPage))
        jsonDictPage = NewsParser().getJsonByRequests(nextUrl)
        nextJsonPage = jsonDictPage['next']['max_behot_time']
        print 'json file time stamp is: ', nextJsonPage
        print count, ' Json files have been stored'
        print u'the ', count, u' round ended'
        count = count + 1
    print '-------------------target time stamp has reached-------------------'
    print '------------------------crawl mission done-------------------------'
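# A hedged driver sketch: 'crawler' stands for an instance of the crawler
# class these methods belong to; the '0' seed timestamp and the datetime
# string format are assumptions about what TouTiaoNewsURL expects.
#
#     startUrl = TouTiaoNewsURL.getTargetURL('joke', '0')
#     crawler.getTimeStampNewsLooply('2017-03-10 12:00:00', 'joke', startUrl)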
import numpy
import pysam
import collections
import os.path
import basics_nuc_seq as bns
from multiprocessing import Pool
import argparse
import os
from mylog import get_logger
import pandas as pd
from utils import simplify_path

logger = get_logger(__file__, __name__)

# TO IMPROVE
# Make Fastq a proper iterator object (__iter__, next etc...)

################################################################################
# OO VERSION


class Fastq(object):

    def __init__(self, path):
        self.path = path
        self.name = simplify_path(path)

    def __repr__(self):
        return '<Fastq object: {}>'.format(self.path)

    def iterate_fastq(self):
        """ Iterate through a fastq file """
    c = 0
    add_number_max = int(get_random_configuration("number", "number"))
    print(datetime.datetime.now(), int(get_random_configuration("sleep", "round_sleep")), "Resting....")
    time.sleep(int(get_random_configuration("sleep", "round_sleep")))
    ys = next(my_ok_yeshens)
    if is_multi:
        pyautogui.click(yeshen_position)
        time.sleep(1)
        pyautogui.click(ys['position'])


if __name__ == '__main__':
    print("Starting after a 10 second rest")
    print("author", "Patrick")
    time.sleep(10)
    log = get_logger(filename="added.log")
    my_yeshens = get_init_yeshens()
    my_ok_yeshens = choose_yeshen(my_yeshens)
    source = AcctSource.read_source()
    try:
        main(my_ok_yeshens)
    except KeyboardInterrupt:
        pass
    finally:
        print(source)
        AcctSource.close_source(source)
        print("Program finished")
from mylog import get_logger

logger = get_logger(__name__)


def f():
    logger.bind(username="******").info("hello")
# A library to detect divergent transcription
import logging
from mylog import get_logger
import inspect
import os
import subprocess

logger = get_logger('', __name__, logging.DEBUG)


def bam_filtering(bam):
    """Filter a bam file based on mapping, mapping of the mate, and splicing."""
    filt_bam = os.path.splitext(bam)[0] + "_filt.bam"
    # Filter the bam: remove unmapped reads, reads with an unmapped mate,
    # spliced reads and secondary mappings
    cmd = "samtools view -h -F 0x4 -F 0x8 -F 0x100 {0} | awk '{{if ($1 ~ /^@/) {{print}} else if ($6 !~ /N/) {{print}}}}' | samtools view -bh > {1}".format(
        bam, filt_bam)
    subprocess.check_output(cmd, shell=True)
    logger.debug('DONE: {}'.format(cmd))
    return filt_bam


def bam_to_fragments(bam):
    """From mapped reads in a bam, get bed intervals of the pairs joined into fragments."""
def storeRecommendTypedNewsIntoMongoDB(self, jsonDict):
    """Store TouTiao_RECOMMEND and other similarly structured real-time news."""
    jsonDataDict = jsonDict['data']
    articleContentParser = ArticleContentParser()  # used to parse the article body
    pattern = re.compile(r'http')  # used to decide whether the page can be opened and parsed
    # Put every news item from the json file into the collection
    count = 1
    for newsObject in jsonDataDict:
        print u'Begin to get the ', count, u' news'
        # Use the timestamp of the first news item as the timestamp of this batch
        if count == 1:
            newsTime = newsObject.get('behot_time')
        count = count + 1
        # If this news item already exists in the database, stop here:
        # everything older has been crawled before.
        if RecommendTypedNewsEngine.check_regular_obj(newsObject):
            return True
        try:
            # Check whether the article body of this json item can be parsed
            if newsObject.get('single_mode') is True:
                temURL = newsObject.get('source_url')
                """
                The source_url here is not necessarily a toutiao.com address;
                it may point to an ad or to another news site, which is why
                getHTMLerror and MongoDB storage errors show up during the task.
                """
                if pattern.match(temURL) is None:
                    targetURL = 'http://www.toutiao.com' + temURL
                    # The body can be parsed: fetch the source page of the
                    # news item in the json file and parse it
                    article_content = articleContentParser.getArticleContent(
                        targetURL)
                else:
                    article_content = []
            else:
                article_content = []
            """
            This part is not stored the way jokes are, because the news body has
            to be extracted, which means every news object needs an extra
            'article_content' attribute. It is not yet clear how to add that
            attribute, so the original form is kept. If 'article_content' could
            be added to the object, only create_regular_obj() of the
            RecommendTypedNewsEngine class in MongoDBController.py would need to
            change: extract the object content there with ['group'][] indexing.
            """
            in_database = {
                'chinese_tag': newsObject.get('chinese_tag'),
                'media_avatar_url': newsObject.get('media_avatar_url'),
                'tag_url': newsObject.get('news_entertainment'),
                'title': newsObject.get('title'),
                'abstract': newsObject.get('abstract'),
                'gallary_image_count': newsObject.get('gallary_image_count'),
                'image_list': newsObject.get('image_list'),
                'behot_time': newsObject.get('behot_time'),
                'source_url': newsObject.get('source_url'),
                'source': newsObject.get('source'),
                'more_mode': newsObject.get('more_mode'),
                'single_mode': newsObject.get('single_mode'),
                'middle_mode': newsObject.get('middle_mode'),
                'article_genre': newsObject.get('article_genre'),
                'comments_count': newsObject.get('comments_count'),
                'has_gallery': newsObject.get('has_gallery'),
                'tag': newsObject.get('tag'),
                'image_url': newsObject.get('image_url'),
                'group_id': newsObject.get('group_id'),
                'article_content': article_content
            }
            # Save the news item in the database
            RecommendTypedNewsEngine.create_regular_obj(in_database)
            # Download the images referenced by the extracted image urls and
            # store them in OSS cloud storage
            ossStore = OssStore()
            image_list = newsObject.get('image_list')
            newsType = newsObject.get('tag')
            newsTitle = newsObject.get('title')
            image_url = newsObject.get('image_url')
            if image_list:
                imageCount = 0
                for url in image_list:
                    imageUrl = url.get('url')
                    ossStore.save_image_in_oss(imageUrl, newsType, newsTime,
                                               newsTitle, imageCount)
                    imageCount = imageCount + 1
            else:
                if image_url:
                    ossStore.save_image_in_oss(image_url, newsType, newsTime,
                                               newsTitle, 0)
        except Exception as e:
            print 'store Recommend-typed news into MongoDB error: ', e
            s = str(e)
            storeRecommendTypedNewsIntoMongoDBErrorMessage = get_logger(
                'MongoDBStoreroom.log')
            storeRecommendTypedNewsIntoMongoDBErrorMessage.error(
                'MongoDBStoreroom storeRecommendTypedNewsIntoMongoDB error: ' + s)
    return False
# | CR_Inp_3.bam     | CR_Inp     |
# | CR_Inp_nas_1.bam | CR_Inp_nas |
# | CR_Inp_nas_2.bam | CR_Inp_nas |
# | CR_Inp_nas_3.bam | CR_Inp_nas |

# TO ADD
# MAKE BAM INDEX FIRST (ALWAYS FORGETTING ABOUT THOSE)

import os
import argparse
import subprocess
import itertools
from mylog import get_logger
import logging

logger = get_logger(__file__, __name__, log_level=logging.DEBUG)


def parse_input(input_tab):
    """Parse input for majiq. Input should have:
    $1: file paths
    $2: group name
    """
    inp = {}
    groups = set()
    with open(input_tab, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            file_path = line.split()[0]