Example #1
    def getRecentNewsLooply(self, newsType, url):
        print '-----------------------Begin to crawl------------------------'
        print '--------------Crawler Mode: RecentNewsCrawler----------------'
        print '------------------Target news type confirmed-----------------'
        print '-----------------News Type: ' + newsType + '---------------------'
        crawlerLoopControlMessage = get_logger('CrawlerControl.log')
        crawlerLoopControlMessage.info('Begin to crawl')
        crawlerLoopControlMessage.info('Crawler Mode: RecentNewsCrawler')
        crawlerLoopControlMessage.info('News type: ' + newsType)

        count = 1
        jsonDictPage = NewsParser().getJsonByRequests(url)
        # By default, keep crawling news indefinitely
        while True:
            print u'Entering the ', count, u' round crawl'
            if newsType == 'joke':
                self.mongoDBStoreroom.storeJokeIntoMongoDB(jsonDictPage)
            else:
                self.mongoDBStoreroom.storeRecommendTypedNewsIntoMongoDB(
                    jsonDictPage)

            nextUrl = url
            jsonDictPage = NewsParser().getJsonByRequests(nextUrl)
            print count, ' Json files have been stored'
            print u'the ', count, u' round ended'
            count = count + 1

        # Never reached: the loop above runs until the process is stopped
        print '-----------------------crawl mission done--------------------------'

    def getArticleContent(self, url):
        article_content = ['-1']  # Sentinel value: if the HTML cannot be fetched or BeautifulSoup cannot parse it, return an empty result
        news_list = []  # Holds the extracted news text

        try:
            html = self.htmlGetter.getHtmlByRequests(url)
            soup = BeautifulSoup(html, 'html.parser')
            article_content = soup.select('.article-content')

        except Exception as e:
            print 'Beautiful Soup load error: ', e
            s = str(e)
            getArticleContentError = get_logger('ArticleContentParser.log')
            getArticleContentError.error(
                'ArticleContentParser getArticleContentError Beautiful soup load error: '
                + s)

        # If an error occurred above, return the empty list immediately
        if article_content == ['-1']:
            return news_list
        # Check whether this is the alternative Q&A-style article layout
        if article_content == []:
            article_content = soup.select('.answer-text-full')
            """
            由于这种类型的网页不更改header无法直接通过toutiao+/group/id的形式打开,所以解析网页也没有什么意义
            if article_content == []:
                article_content = soup.select('.tt-ignored-node')
            """

        for news in article_content:
            newsText = news.get_text()
            news_list.append(newsText)

        return news_list
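
All of these snippets depend on a project-local mylog.get_logger helper whose implementation is not included in the listing. As a rough sketch only, assuming the single-argument calls above (e.g. get_logger('CrawlerControl.log')) simply attach a file handler from the standard logging module, it might look like the following; the names and format string are illustrative, not the project's actual code.

# Hypothetical sketch of mylog.get_logger for the single-filename call style
# used above (an assumption, not the project's actual implementation).
import logging


def get_logger(filename, level=logging.INFO):
    logger = logging.getLogger(filename)
    if not logger.handlers:  # avoid stacking duplicate handlers on repeated calls
        handler = logging.FileHandler(filename)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(level)
    return logger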
Example #3
    def storeJokeIntoMongoDB(self, jsonDict):
        """
        储存TouTiao_JOKE实时新闻
        """
        jsonDataDict = jsonDict['data']
        """
        将获取json文件的新闻装入collection
        """
        count = 1
        for newsObject in jsonDataDict:
            print u'Begin to get the ', count, u' news'
            count = count + 1
            # Check whether this news item already exists in the database
            if JokeNewsEngine.check_joke_obj(newsObject):
                # Already stored, so older news has been crawled; stop here
                return True

            try:
                JokeNewsEngine.create_joke_obj(newsObject)

            except Exception as e:
                print 'store JOKE into MongoDB error: ', e
                s = str(e)
                storeJokeIntoMongoDBErrorMessage = get_logger(
                    'MongoDBStoreroom.log')
                storeJokeIntoMongoDBErrorMessage.error(
                    'MongoDBStoreroom storeJokeIntoMongoDB error: ' + s)

        return False
Example #4
    def getJsonByRequests(self, url):
        # cookie = self.getCookie()
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Host':
            "www.toutiao.com",
            'Referer':
            "http://www.toutiao.com/",
            'X-Requested-With':
            "XMLHttpRequest",
            'Cookie':
            "csrftoken=4c9d92dba03a267de6bdb8d8d2a59802; tt_webid=56609842670; uuid='w:ccb829fa9020459cbe1024b623988c74'; _ga=GA1.2.188412771.1489068623; CNZZDATA1259612802=794809316-1489067802-%7C1489229849; UM_distinctid=15abc5cd6e92d2-0ff005b574fc54-1262694a-e1000-15abc5cd6ea151; utm_source=toutiao; __tasessionId=gkvfupjml1489230426764"
        }

        data = None  # Returned as None if the request or JSON parsing fails
        try:
            request = urllib2.Request(url, headers=headers)
            page = urllib2.urlopen(request).read()
            data = json.loads(page)

        except Exception as e:
            print 'getJsonError: ', e
            s = str(e)
            getJsonByRequestsError = get_logger('NewsParserError.log')
            getJsonByRequestsError.error(
                'getJsonByRequestsError getJsonError: ' + s)

        return data
Example #5
    def connectToMongoDB(self):
        try:
            connect(self.dataBase, host=self.host, port=self.port)

        except Exception as e:
            print 'connect to MongoDB error', e
            s = str(e)
            MongoDBControllerConnectionErrorMessage = get_logger('MongoDBController.log')
            MongoDBControllerConnectionErrorMessage.error('mongoDB connection error: ' + s)
Example #6
    def create_regular_obj(cls, ori_data):
        try:
            cls(group_id=ori_data['group_id'],
                news_tag=ori_data['tag'],
                comment_count=ori_data['comments_count'],
                behot_time=ori_data['behot_time'],
                ori_data=ori_data).save()

        except Exception as e:
            print "news object save error: ", e
            s = str(e)
            mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
            mongoDBControllerErrorMessage.error('recommend-typed news object save error: ' + s)
Example #7
    def create_joke_obj(cls, ori_data):
        try:
            cls(id_str=ori_data['group']['id_str'], favorite_count=ori_data['group']['favorite_count'], comment_count=ori_data['group']['comment_count'], go_detail_count=ori_data['group']['go_detail_count'],
                share_count=ori_data['group']['share_count'], bury_count=ori_data['group']['bury_count'], digg_count=ori_data['group']['digg_count'], online_time=ori_data['online_time'], ori_data=ori_data).save()

        except Exception as e:
            print "news object save error: ", e
            s = str(e)
            mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
            mongoDBControllerErrorMessage.error('joke news object save error: ' + s)
Example #8
    def getTimeStampNewsLooply(self, hot_timeStamp, newsType, url):
        """
        根据时间戳判断是否一直获取到目标时间点新闻
        """
        print '-----------------------Begin to crawl------------------------'
        print '----------------Crawler Mode: TimeStampCrawler---------------'
        print '------------------Target timestamp confirmed-----------------'
        print '---------------------' + hot_timeStamp + '--------------------'
        print '------------------Target news type confirmed-----------------'
        print '-----------------News Type: ' + newsType + '---------------------'
        crawlerLoopControlMessage = get_logger('CrawlerControl.log')
        crawlerLoopControlMessage.info('Begin to crawl')
        crawlerLoopControlMessage.info('Crawler Mode: TimeStampCrawler')
        crawlerLoopControlMessage.info('Target timestamp: ' + hot_timeStamp)
        crawlerLoopControlMessage.info('News type: ' + newsType)

        count = 1
        nextNewsType = newsType  # News type to keep crawling
        targetTimeStamp = TouTiaoNewsURL.dateTimeToTimeStamp(
            hot_timeStamp)  # Convert the target date to a timestamp string
        intTargetTimeStamp = int(targetTimeStamp)  # str -> int
        jsonDictPage = NewsParser().getJsonByRequests(url)  # Fetch the JSON payload
        nextJsonPage = jsonDictPage['next']['max_behot_time']  # Timestamp of the next JSON page
        print 'target time stamp is: ', intTargetTimeStamp
        # Keep fetching JSON pages until the target timestamp is reached
        while intTargetTimeStamp < nextJsonPage:
            print u'Entering the', count, u' round crawl'
            if newsType == 'joke':
                flag = self.mongoDBStoreroom.storeJokeIntoMongoDB(jsonDictPage)
            else:
                flag = self.mongoDBStoreroom.storeRecommendTypedNewsIntoMongoDB(
                    jsonDictPage)

            # Incremental-update flag: if flag is True, everything older has already been crawled and the task can stop
            if flag:
                break

            nextUrl = TouTiaoNewsURL.getTargetURL(nextNewsType,
                                                  str(nextJsonPage))
            jsonDictPage = NewsParser().getJsonByRequests(nextUrl)
            nextJsonPage = jsonDictPage['next']['max_behot_time']
            print 'json file time stamp is: ', nextJsonPage
            print count, ' Json files have been stored'
            print u'the ', count, u' round ended'
            count = count + 1

        print '-------------------target time stamp has reached-------------------'
        print '------------------------crawl mission done-------------------------'
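
The helper TouTiaoNewsURL.dateTimeToTimeStamp used above is not shown in the listing. A minimal sketch, under the assumption that hot_timeStamp is a 'YYYY-MM-DD HH:MM:SS' string and that max_behot_time is a Unix timestamp in seconds, could be:

# Hypothetical sketch of dateTimeToTimeStamp (an assumption, not the
# project's actual implementation): 'YYYY-MM-DD HH:MM:SS' -> timestamp string.
import time


def dateTimeToTimeStamp(date_time_str):
    struct = time.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
    return str(int(time.mktime(struct)))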
Example #9
import numpy
import pysam
import collections
import os.path
import basics_nuc_seq as bns
from multiprocessing import Pool
import argparse
import os
from mylog import get_logger
import pandas as pd
from utils import simplify_path

logger = get_logger(__file__, __name__)

# TO IMPROVE
# Make Fastq a proper iterator object (__iter__, next etc...)

################################################################################
# OO VERSION


class Fastq(object):
    def __init__(self, path):
        self.path = path
        self.name = simplify_path(path)

    def __repr__(self):
        return '<Fastq object: {}>'.format(self.path)

    def iterate_fastq(self):
        """ Iterate through a fastq file """
Example #10
            c = 0
            add_number_max = int(get_random_configuration("number", "number"))
            print(datetime.datetime.now(),
                  int(get_random_configuration("sleep", "round_sleep")),
                  "休息。。。。")
            time.sleep(int(get_random_configuration("sleep", "round_sleep")))
            ys = next(my_ok_yeshens)
            if is_multi:
                pyautogui.click(yeshen_position)
                time.sleep(1)
            pyautogui.click(ys['position'])


if __name__ == '__main__':
    print("休息10s后即将启动")
    print("author", "Patrick")
    time.sleep(10)
    log = get_logger(filename="added.log")
    my_yeshens = get_init_yeshens()
    my_ok_yeshens = choose_yeshen(my_yeshens)
    source = AcctSource.read_source()

    try:
        main(my_ok_yeshens)
    except KeyboardInterrupt:
        pass
    finally:
        print(source)
        AcctSource.close_source(source)
        print("程序结束")
Example #11
from mylog import get_logger
logger = get_logger(__name__)


def f():
    logger.bind(username="******").info("hello")
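
The .bind() call above is not part of the standard-library logging API; it matches loguru's contextual binding, which suggests this particular mylog.get_logger returns a loguru logger. A minimal sketch under that assumption (not the project's actual module):

# Hypothetical loguru-backed get_logger (an assumption based on the .bind() usage above).
from loguru import logger as _base_logger


def get_logger(name):
    # Attach the module name as contextual data on every record
    return _base_logger.bind(module=name)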
Example #12
# A library to detect divergent transcription

import logging
from mylog import get_logger
import inspect
import os
import subprocess

logger = get_logger('', __name__, logging.DEBUG)


def bam_filtering(bam):
    """ Filters bam file based on mapping, mapping of mate, splicing
    """

    filt_bam = os.path.splitext(bam)[0] + "_filt.bam"

    # Filtering bam, removing unmapped, mate unmapped, spliced, secondary mapping
    cmd = "samtools view -h -F 0x4 -F 0x8 -F 0x100 {0} | awk '{{if ($1 ~ /^@/) {{print}} else if ($6 !~ /N/) {{print}}}}' | samtools view -bh > {1}".format(
        bam, filt_bam)

    subprocess.check_output(cmd, shell=True)
    logger.debug('DONE: {}'.format(cmd))

    return filt_bam


def bam_to_fragments(bam):
    """From mapped reads in a bam, get bed intervals of the pairs joined
    into fragments
    """
Example #13
    def storeRecommendTypedNewsIntoMongoDB(self, jsonDict):
        """
        储存TouTiao_RECOMMEND等结构相似的实时新闻
        """
        jsonDataDict = jsonDict['data']
        articleContentParser = ArticleContentParser()  # Used to parse article bodies from web pages
        pattern = re.compile(r'http')  # Used to decide whether the page URL can be opened and parsed
        """
        Load the news items from the fetched JSON into the collection
        """
        count = 1
        for newsObject in jsonDataDict:
            print u'Begin to get the ', count, u' news'
            # Use the first item's timestamp as the timestamp of this news batch
            if count == 1:
                newsTime = newsObject.get('behot_time')
            count = count + 1
            # Check whether this news item already exists in the database
            if RecommendTypedNewsEngine.check_regular_obj(newsObject):
                # Already stored, so older news has been crawled; stop here
                return True

            try:
                # Check whether the article body of this JSON item can be parsed
                if newsObject.get('single_mode') is True:
                    temURL = newsObject.get('source_url')
                    """
                    此处的source_url不一定是头条自己的网址,可能是广告或者是其他新闻网站的地址
                    所以在任务执行过程中会出现html无法获取导致的getHTMLerror和Mongodb无法储存的错误
                    """
                    if pattern.match(temURL) is None:
                        targetURL = 'http://www.toutiao.com' + temURL
                        # Parsable: fetch the news item's source page URL from the JSON and parse it
                        article_content = articleContentParser.getArticleContent(
                            targetURL)
                    else:
                        article_content = []
                else:
                    article_content = []
                """
                此处没改为存储段子的形式,因为要提取新闻的内容,所以要给每个新闻对象创建一个'article_content'的属性,暂时还不清楚怎么添加所以依旧采用原来的形式。
                如果能够给对象添加'article_content'这个属性,那么只用修改MongoDBController.py中的类RecommendTypedNewsEngine的create_regular_obj(),
                在Create_regular_obj()函数中提取对象内容加上['group'][]即可
                """
                in_database = {
                    'chinese_tag': newsObject.get('chinese_tag'),
                    'media_avatar_url': newsObject.get('media_avatar_url'),
                    'tag_url': newsObject.get('news_entertainment'),
                    'title': newsObject.get('title'),
                    'abstract': newsObject.get('abstract'),
                    'gallary_image_count':
                    newsObject.get('gallary_image_count'),
                    'image_list': newsObject.get('image_list'),
                    'behot_time': newsObject.get('behot_time'),
                    'source_url': newsObject.get('source_url'),
                    'source': newsObject.get('source'),
                    'more_mode': newsObject.get('more_mode'),
                    'single_mode': newsObject.get('single_mode'),
                    'middle_mode': newsObject.get('middle_mode'),
                    'article_genre': newsObject.get('article_genre'),
                    'comments_count': newsObject.get('comments_count'),
                    'has_gallery': newsObject.get('has_gallery'),
                    'tag': newsObject.get('tag'),
                    'image_url': newsObject.get('image_url'),
                    'group_id': newsObject.get('group_id'),
                    'article_content': article_content
                }
                # Save the news item to the database
                RecommendTypedNewsEngine.create_regular_obj(in_database)
                """
                然后这个部分把提取到的新闻image url的图片下载保存到oss云端
                """
                ossStore = OssStore()
                image_list = newsObject.get('image_list')
                newsType = newsObject.get('tag')
                newsTitle = newsObject.get('title')
                image_url = newsObject.get('image_url')
                if image_list:
                    imageCount = 0
                    for url in image_list:
                        imageUrl = url.get('url')
                        ossStore.save_image_in_oss(imageUrl, newsType,
                                                   newsTime, newsTitle,
                                                   imageCount)
                        imageCount = imageCount + 1
                else:
                    if image_url:
                        ossStore.save_image_in_oss(image_url, newsType,
                                                   newsTime, newsTitle, 0)

            except Exception as e:
                print 'store Recommend-typed news into MongoDB error: ', e
                s = str(e)
                storeRecommendTypedNewsIntoMongoDBErrorMessage = get_logger(
                    'MongoDBStoreroom.log')
                storeRecommendTypedNewsIntoMongoDBErrorMessage.error(
                    'MongoDBStoreroom storeRecommendTypedNewsIntoMongoDB error: '
                    + s)

        return False
Example #14
# | CR_Inp_3.bam     | CR_Inp     |
# | CR_Inp_nas_1.bam | CR_Inp_nas |
# | CR_Inp_nas_2.bam | CR_Inp_nas |
# | CR_Inp_nas_3.bam | CR_Inp_nas |

# TO ADD
# MAKE BAM INDEX FIRST (ALWAYS FORGETTING ABOUT THOSE)

import os
import argparse
import subprocess
import itertools
from mylog import get_logger
import logging

logger = get_logger(__file__, __name__, log_level=logging.DEBUG)


def parse_input(input_tab):
    """Parse input for majiq. Input should have:
    $1: file paths
    $2: group name
    """
    inp = {}
    groups = set()
    with open(input_tab, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
        
            file_path = line.split()[0]