예제 #1
0
 def __init__(self):
     """Make sure the instance has a lock object.

     Reads ``self._lock`` and, when it is ``None``, replaces it with a new
     ``threading.Lock``.

     Raises
     ------
     NotImplementedError
         If an ``IOError`` is raised while the lock is being set up. The
         failure is logged before the exception is raised.
     """
     try:
         lock_missing = self._lock is None
         if lock_missing:
             self._lock = threading.Lock()
     except IOError:
         logger.error('Error')
         raise NotImplementedError('Error')
예제 #2
0
def get_parent(raw, comments):
    """Return parent of a comment.

    The response header at the start of |raw| is located with
    RESPONSE_HEAD_RE and its date and time are extracted with DATE_TIME_RE.
    Candidate parents are the already processed comments whose ``posted``
    value (ignoring anything after the first '.') equals that date and time.

    Parameters
    ----------
    raw: str
        The raw text of the comment message for which the parent is to be
        identified.
    comments: list
        A list of instances of app.models.Comment representing the comments
        that have already been processed. The list is used to look for the
        parent of the comment represented by the raw text argument.

    Returns
    -------
    comment: object
        An instance of app.models.Comment that represents the parent of the
        comment specified using |raw|. Resolution order:

        1. the single comment whose timestamp matches the response header;
        2. otherwise, the first comment whose quoted text occurs in |raw|;
        3. otherwise (including when |raw| has no response header), the
           last comment in |comments|.

        None is returned when |comments| is empty, or when a response header
        is present but no date and time can be found in it.
    """
    head = RESPONSE_HEAD_RE.match(raw)
    if head is not None:
        # Extract date and time from the comment response header.
        header = head.group(0)

        stamp = DATE_TIME_RE.search(header)
        if stamp is None:
            logger.error('NONE: ' + raw)
            return None
        components = stamp.groupdict()
        timestamp = '{date} {time}'.format(
            date=components['date'].replace('/', '-'),
            time=components['time'])

        # The author of the header is not needed to resolve the parent, so
        # it is deliberately not extracted: a header without a recognizable
        # author must not abort the lookup.

        # Compare timestamps without the fractional part (everything from
        # the first '.' onwards in ``posted`` is ignored).
        parents = [
            comment for comment in comments
            if comment.posted.partition('.')[0] == timestamp
        ]

        if len(parents) == 1:
            # Only one parent found by timestamp matching. Return it.
            return parents[0]

        # Zero or several candidates found by timestamp matching. Fall back
        # to looking for the quoted text of every known comment in |raw|.
        for comment in comments:
            # NOTE(review): both arguments are raw strings, so this rewrites
            # the two-character sequence backslash + 'n', not real newline
            # characters. Confirm that this matches how comment text is
            # stored before changing it.
            text = '> {}'.format(comment.text.replace(r'\n', r'\n> '))
            if text in raw:
                return comment
    return comments[-1] if comments else None
예제 #3
0
    def __init__(self):
        """Load the model file while holding the instance lock.

        Creates ``self._lock`` when it is ``None``. With the lock held, if
        ``app.exists_output_fasttext_model()`` is true, the path returned by
        ``app.get_output_fasttext_model()`` is loaded with ``ft.load_model``
        and stored in ``self._model``. ``self._model`` is left untouched
        when no model file exists.

        Raises
        ------
        NotImplementedError
            If an ``IOError`` occurs while loading the model. The failure is
            logged before the exception is raised.
        """
        try:
            if self._lock is None:
                self._lock = threading.Lock()
            with self._lock:
                if app.exists_output_fasttext_model():
                    self._model = ft.load_model(
                        app.get_output_fasttext_model())

        except IOError:
            # The log line names the same artifact as the raised error so
            # that both point at the model file.
            logger.error('load error to model.bin')
            raise NotImplementedError('load error to model.bin')
예제 #4
0
def clean_treeparse(tree):
    """
    Normalize the string representation of a syntactic tree.

    Three substitutions are applied in order: runs of two or more spaces
    are collapsed to a single space, every newline is removed, and every
    occurrence of 'ROOT' (the tag the Stanford CoreNLP includes in the
    parse string) is removed. Because spaces are collapsed before newlines
    are stripped, spaces on either side of a newline are not merged.

    Returns the cleaned string. If the regular expressions raise a
    TypeError for the given input, the failure is logged and the string
    "RegexFailed" is returned instead.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r' {2,}', ' '),
        (r'\n', ''),
        (r'ROOT', ''),
    )
    try:
        result = tree
        for pattern, replacement in substitutions:
            result = re.sub(pattern, replacement, result)
    except TypeError:
        logger.error("REGEX FAILED: " + str(tree))
        return "RegexFailed"
    return result
예제 #5
0
 def __init__(self):
     """Load the master JSON file, creating an empty one if it is missing.

     Sets ``self.file`` to 'output/master.json' and creates ``self._lock``
     when it is ``None``. With the lock held, ``self._data`` is set to the
     decoded content of the file, or to an empty dict after writing ``{}``
     to a newly created file.

     Raises
     ------
     NotImplementedError
         If an ``IOError`` occurs while reading or creating the file. The
         failure is logged before the exception is raised.
     """
     self.file = 'output/master.json'
     try:
         if self._lock is None:
             self._lock = threading.Lock()
         with self._lock:
             if not os.path.exists(self.file):
                 # No master file yet: create one holding an empty object.
                 with codecs.open(self.file, 'w', "utf-8") as handle:
                     json.dump({}, handle, ensure_ascii=False)
                     self._data = {}
             else:
                 # The file exists: read its content.
                 with codecs.open(self.file, 'r', "utf-8") as handle:
                     self._data = json.load(handle)
     except IOError:
         logger.error('load error to master.json')
         raise NotImplementedError('load error to master.json')
예제 #6
0
    def parse_item(self, response):
        """Extract one item from a response and append it to the master file.

        The key is the first group of ``BlogSpider.pattern`` applied to
        ``response.url``. Every '.dv-episode-container' element is passed to
        ``parse_episode``, which fills the ``episodes`` dict. The title,
        story and year are read from the page, sanitized with
        ``app.sanitize`` and, together with the episodes, appended to the
        master file under 'amazon'. Finally ``check_format`` is called with
        the key and the collected value.

        A ``TypeError`` raised along the way is not propagated: it is
        logged together with the URL, the traceback is passed to
        ``backtrace.hook`` and the response is opened with
        ``open_in_browser``.
        """
        try:
            key = BlogSpider.pattern.search(response.url).group(1)

            episodes = {}
            for episode_html in response.css('.dv-episode-container'):
                self.parse_episode(episodes, episode_html)

            title = app.sanitize(
                response.css('section > h1::text').extract_first())
            story = app.sanitize(
                response.css(
                    'div.av-synopsis.avu-full-width > p::text'
                ).extract_first())
            year = app.sanitize(
                response.css(
                    '[data-automation-id="release-year-badge"]::text'
                ).extract_first())
            value = {
                'title': title,
                'story': story,
                'year': year,
                'episodes': episodes,
            }

            # Load the master file (described as a singleton by the
            # original author) and record this item.
            master_file.MasterFile().append('amazon', key, value)

            self.check_format(key, value)
        except TypeError as error:
            logger.error(str(error) + ' ' + response.url + ' ')
            exc_type, exc_value, exc_tb = sys.exc_info()
            backtrace.hook(reverse=True,
                           strip_path=True,
                           tb=exc_tb,
                           tpe=exc_type,
                           value=exc_value)
            # Open the offending page for inspection.
            open_in_browser(response)
예제 #7
0
    def parse_episode(self, episodes, html):
        """Parse one episode element and store the result in ``episodes``.

        ``BlogSpider.title_pattern`` is applied to the second text node of
        '.dv-el-title'; group 1 becomes the key in ``episodes`` and group 2
        the episode title.

        The status is resolved in this order:

        1. 'Prime' when a '.dv-el-prime' element is present;
        2. group 1 of ``BlogSpider.status`` applied to the
           '.dv-el-status-text' text, or '' when the pattern does not match
           or the group is empty;
        3. the '.a-size-base.dv-el-3psub.a-text-normal' text;
        4. '' when none of the above is present.

        Parameters
        ----------
        episodes: dict
            Mapping that is updated in place with the parsed episode.
        html: object
            Selector for a single episode element; it must provide
            ``css(...).extract()`` and ``css(...).extract_first()``.

        Raises
        ------
        TypeError, AttributeError
            Re-raised after logging when the title text cannot be matched
            by ``BlogSpider.title_pattern``.
        """
        title_texts = html.css('.dv-el-title::text').extract()
        try:
            # One search yields both the index and the title.
            title_match = BlogSpider.title_pattern.search(title_texts[1])
            index = title_match.group(1)
            title = title_match.group(2)
        except (TypeError, AttributeError) as error:
            logger.error(str(error) + ' ' + str(title_texts))
            raise

        status = ''
        if html.css('.dv-el-prime').extract_first():
            status = 'Prime'
        else:
            status_text = html.css(
                '.dv-el-status-text::text').extract_first()
            if status_text:
                # The pattern may not match at all; treat that the same way
                # as an empty group instead of failing on ``None.group``.
                status_match = BlogSpider.status.search(status_text)
                if status_match is not None and status_match.group(1):
                    status = status_match.group(1)
            else:
                fallback_text = html.css(
                    '.a-size-base.dv-el-3psub.a-text-normal::text'
                ).extract_first()
                if fallback_text:
                    status = fallback_text

        episodes[index] = {
            'title':
            app.sanitize(title),
            'story':
            app.sanitize(html.css('p.a-text-normal::text').extract_first()),
            'data-aliases':
            app.sanitize(html.css('::attr(data-aliases)').extract_first()),
            'status':
            app.sanitize(status)
        }