def __init__(self):
    """Lazily create the class-shared lock on first construction."""
    try:
        # Only the very first caller allocates the lock; every later
        # call sees a non-None value and leaves it untouched.
        if self._lock is None:
            self._lock = threading.Lock()
    except IOError:
        logger.error('Error')
        raise NotImplementedError('Error')
def get_parent(raw, comments):
    """Return the parent of a comment.

    Parameters
    ----------
    raw: str
        The raw text of the comment message for which the parent is to
        be identified.
    comments: list
        A list of instances of app.models.Comment representing the
        comments that have already been processed. The list is used to
        look for the parent of the comment represented by the raw text
        argument.

    Returns
    -------
    comment: object
        An instance of app.models.Comment that represents the parent of
        the comment specified using |raw|. None is returned if no
        parent was found.
    """
    match = RESPONSE_HEAD_RE.match(raw)
    if match is not None:
        # Extract date and time from the comment response header
        header = match.group(0)
        match = DATE_TIME_RE.search(header)
        if match is None:
            logger.error('NONE: ' + raw)
            return None
        components = match.groupdict()
        timestamp = '{date} {time}'.format(
            date=components['date'].replace('/', '-'),
            time=components['time'])
        # BUG FIX: the author previously extracted here via
        # AUTHOR_RE.search(header).group(1) was never used, and raised
        # AttributeError whenever the search found no match; the dead
        # extraction has been removed.
        parents = []
        for comment in comments:
            # Compare timestamps without milliseconds
            ind = len(comment.posted)
            if '.' in comment.posted:
                ind = comment.posted.index('.')
            if comment.posted[:ind] == timestamp:
                parents.append(comment)
        if len(parents) == 1:
            # Only one parent found by timestamp matching. Return it.
            return parents[0]
        # Zero or multiple parents found by timestamp matching; fall
        # back to matching the quoted ('> '-prefixed) full text.
        for comment in comments:
            text = '> {}'.format(comment.text.replace(r'\n', r'\n> '))
            if text in raw:
                return comment
    # No parent identified: default to the most recent comment, if any.
    return None if not comments else comments[-1]
def __init__(self):
    """Lazily load the shared fastText model, guarded by a class lock.

    Raises
    ------
    NotImplementedError
        If reading the model file fails with an IOError.
    """
    try:
        if self._lock is None:
            self._lock = threading.Lock()
        with self._lock:
            # Load the model only if it has already been produced.
            if app.exists_output_fasttext_model():
                self._model = ft.load_model(
                    app.get_output_fasttext_model())
    except IOError:
        # BUG FIX: the log message previously said 'master.json'
        # (copy-pasted from the JSON loader) while this method loads
        # model.bin — keep the log and the raise consistent.
        logger.error('load error to model.bin')
        raise NotImplementedError('load error to model.bin')
def clean_treeparse(tree):
    """Normalize a Stanford CoreNLP syntactic-tree string.

    Given a string representation of a syntactic tree, remove duplicate
    spaces, all newlines, and the initial 'ROOT' tag that the Stanford
    CoreNLP includes in the parse string.

    Parameters
    ----------
    tree: str
        The raw parse string produced by CoreNLP.

    Returns
    -------
    str
        The cleaned tree, or the sentinel string "RegexFailed" when
        *tree* is not a string (TypeError from re.sub).
    """
    try:
        cleaned_tree = re.sub(r' {2,}', ' ', tree)
        cleaned_tree = re.sub(r'\n', '', cleaned_tree)
        # BUG FIX: only the initial ROOT tag should be removed; without
        # count=1 every token containing the substring 'ROOT' was
        # mangled, contradicting the documented contract.
        cleaned_tree = re.sub(r'ROOT', '', cleaned_tree, count=1)
        return cleaned_tree
    except TypeError:
        logger.error("REGEX FAILED: " + str(tree))
        return "RegexFailed"
def __init__(self):
    """Load (or create) the singleton master JSON file into memory.

    Raises
    ------
    NotImplementedError
        If reading or creating output/master.json fails with IOError.
    """
    self.file = 'output/master.json'
    try:
        if self._lock is None:
            self._lock = threading.Lock()
        with self._lock:
            # Does the file exist yet?
            if os.path.exists(self.file):
                with codecs.open(self.file, 'r', "utf-8") as f:
                    self._data = json.load(f)
            else:
                # Create an empty file.
                with codecs.open(self.file, 'w', "utf-8") as f:
                    json.dump({}, f, ensure_ascii=False)
                self._data = {}
    except IOError:
        logger.error('load error to master.json')
        raise NotImplementedError('load error to master.json')
def parse_item(self, response):
    """Parse one Amazon Video title page and record it in the master file.

    Extracts the title, synopsis, release year and per-episode data
    from *response*, then appends the record under the 'amazon' key of
    the shared master file singleton. On TypeError (a missing page
    element), the error is logged with a backtrace and the page is
    opened in a browser for manual inspection; scraping continues.
    """
    try:
        key = BlogSpider.pattern.search(response.url).group(1)
        episodes = {}
        for html in response.css('.dv-episode-container'):
            self.parse_episode(episodes, html)
        value = {
            'title': app.sanitize(
                response.css('section > h1::text').extract_first()),
            'story': app.sanitize(
                response.css('div.av-synopsis.avu-full-width > p::text').
                extract_first()),
            'year': app.sanitize(
                response.css(
                    '[data-automation-id="release-year-badge"]::text').
                extract_first()),
            'episodes': episodes
        }
        # Load the master file (singleton).
        master = master_file.MasterFile()
        master.append('amazon', key, value)
        self.check_format(key, value)
    except TypeError as error:
        # Best-effort error path: log, dump a readable backtrace, open
        # the offending page.  (A redundant trailing `pass` was removed.)
        logger.error(str(error) + ' ' + response.url + ' ')
        tpe, v, tb = sys.exc_info()
        backtrace.hook(reverse=True, strip_path=True, tb=tb, tpe=tpe,
                       value=v)
        open_in_browser(response)
def parse_episode(self, episodes, html):
    """Parse one episode container into *episodes*, keyed by index.

    Parameters
    ----------
    episodes: dict
        Accumulator mapping episode index -> episode record; mutated
        in place.
    html: object
        Selector for one '.dv-episode-container' element.

    Raises
    ------
    TypeError, AttributeError
        Re-raised (after logging) when the episode title line does not
        match the expected pattern.
    """
    titles = html.css('.dv-el-title::text').extract()
    try:
        # PERF/BUG FIX: the original ran the identical regex search
        # twice (once for group(1), once for group(2)); search once and
        # reuse the match object.
        match = BlogSpider.title_pattern.search(titles[1])
        index = match.group(1)
    except (TypeError, AttributeError) as error:
        logger.error(str(error) + ' ' + str(titles))
        raise
    title = match.group(2)
    status = ''
    if html.css('.dv-el-prime').extract_first():
        status = 'Prime'
    elif html.css('.dv-el-status-text::text').extract_first():
        status = BlogSpider.status.search(
            html.css('.dv-el-status-text::text').extract_first())
        if status.group(1):
            status = status.group(1)
        else:
            status = ''
    elif html.css('.a-size-base.dv-el-3psub.a-text-normal::text'
                  ).extract_first():
        status = html.css('.a-size-base.dv-el-3psub.a-text-normal::text'
                          ).extract_first()
    episodes[index] = {
        'title': app.sanitize(title),
        'story': app.sanitize(
            html.css('p.a-text-normal::text').extract_first()),
        'data-aliases': app.sanitize(
            html.css('::attr(data-aliases)').extract_first()),
        'status': app.sanitize(status)
    }