def parse(self, response):
    """Default method."""
    for page in response.css('.chronicle_title > a'):
        logger.info(page.css('::text').extract())
        yield response.follow(page, self.parse_season)

def parse_season(self, response):
    """Default method."""
    for page in response.css('.subMenuListSeason > li > a'):
        logger.info(page.css('::text').extract())
        yield response.follow(page, self.parse_list)

def handle(self, *args, **options):  # pragma: no cover
    """Tag comments using the comment-level tagger."""
    processes = options['processes']
    condition = options['condition']  # Note: read but currently unused here.
    year = options['year']

    begin = dt.now()
    try:
        if year == 0:
            comms = qs.query_all('comment', ids=False)
        else:
            comms = qs.query_by_year(year, 'comment', ids=False)
        comms = comms.exclude(text='').iterator()

        connections.close_all()

        tagger = taggers.CommentLevelTagger(settings, processes, comms)
        tagger.tag()
    except KeyboardInterrupt:  # pragma: no cover
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
            helpers.get_elapsed(begin, dt.now())
        ))

def handle(self, *args, **options):
    """Tag sentences with sentiment using the sentiment tagger."""
    processes = options['processes']
    condition = options['condition']
    year = options['year']

    begin = dt.now()
    try:
        if year == 0:
            sents = qs.query_all('sentence', ids=False)
        else:
            sents = qs.query_by_year(year, 'sentence', ids=False)

        if condition == 'all':
            sents = sents.exclude(text='').iterator()
        elif condition in ('empty', 'failed'):
            sents = sents.filter(metrics__sentiment={}).exclude(
                text='').iterator()

        connections.close_all()

        tagger = taggers.SentimentTagger(settings, processes, sents)
        tagger.tag()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
            helpers.get_elapsed(begin, dt.now())
        ))

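# Hypothetical invocation sketch for the two tagger commands above. The
# management-command names and CLI flags below are assumptions (the commands'
# add_arguments() definitions are not shown); only the option keys
# 'processes', 'condition', and 'year' come from the handlers themselves:
#
#   python manage.py tagcomments --processes 4 --year 2014
#   python manage.py tagsentiment --processes 4 --condition empty --year 0
#
# year=0 selects every year; condition='all' retags every non-empty sentence,
# while 'empty' and 'failed' retag only sentences whose sentiment metric is {}.
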
def handle(self, *args, **options):
    primary = options['primary']
    secondary = options['secondary']

    if not primary or not os.path.exists(os.path.expanduser(primary)):
        raise Exception('{0} is not a valid CSV path'.format(primary))
    if not secondary or not os.path.exists(os.path.expanduser(secondary)):
        raise Exception('{0} is not a valid CSV path'.format(secondary))

    # Each dataset is keyed on the function's identity, mapping to its SLOC.
    primary_dataset = dict()
    secondary_dataset = dict()

    with open(primary) as file_:
        reader = csv.reader(file_)
        for row in reader:
            if 'Function' in row[0]:
                function = Function()
                function.name = row[1]
                function.file = row[2]
                if function not in primary_dataset:
                    primary_dataset[function] = int(row[3])
                else:
                    logger.debug(
                        '{0} duplicate in {1}'.format(function, primary)
                    )

    with open(secondary) as file_:
        reader = csv.reader(file_)
        for row in reader:
            if 'Function' in row[0]:
                function = Function()
                function.name = row[1]
                function.file = row[2]
                if function not in secondary_dataset:
                    secondary_dataset[function] = int(row[3])
                else:
                    logger.debug(
                        '{0} duplicate in {1}'.format(function, secondary)
                    )

    match = 0
    for item in secondary_dataset:
        if item in primary_dataset and \
                secondary_dataset[item] == primary_dataset[item]:
            match += 1

    logger.info('{0}/{1} functions have matching SLOC'.format(
        match, len(secondary_dataset)
    ))

def parse_list(self, response):
    """Default method."""
    # Load the master file (singleton)
    master = master_file.MasterFile()

    for page in response.css('span.animeTitle > a'):
        logger.info(page.css('::text').extract())
        yield response.follow(page, self.parse_item)

    # Follow pagination links.
    for page in response.css('a.next'):
        yield response.follow(page, self.parse_list)

    master.save()

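# A minimal sketch of the Scrapy spider these callbacks are assumed to belong
# to. The class name, spider name, and start URL are placeholders; the
# callback chain parse -> parse_season -> parse_list -> parse_item and the
# MasterFile singleton come from the methods above.
import logging

import scrapy

logger = logging.getLogger(__name__)


class ChronicleSpider(scrapy.Spider):  # hypothetical name
    name = 'chronicle'                 # hypothetical
    start_urls = ['https://example.com/chronicle/']  # placeholder

    # parse(), parse_season(), parse_list(), and parse_item() as defined
    # above: each chronicle page links to per-season lists, whose anime
    # titles are followed into parse_item() while 'a.next' handles
    # pagination.
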
def _update(self, instance, collection):
    # Precondition: callers invoke this only when a duplicate of `instance`
    # is already in `collection`, so the loop below always finds a match.
    slocs = [instance.sloc]
    for item in collection:
        if item == instance:
            slocs.append(item.sloc)
            collection.remove(item)
            break

    # Merge the duplicate measurements into a single rounded mean.
    instance.sloc = round(statistics.mean(slocs), 0)
    logger.info('Duplicate {0}:{1} replaced with {2}:{3}'.format(
        item, item.sloc, instance, instance.sloc
    ))
    collection.add(instance)

    return collection

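# A self-contained illustration of the merge rule in _update(): when a
# duplicate measurement arrives, the kept instance's sloc becomes the rounded
# mean of both values. The Item class here is hypothetical; the real code
# operates on Function and File model instances, which are assumed to define
# __eq__/__hash__ on their identity fields.
import statistics


class Item:
    def __init__(self, name, sloc):
        self.name = name
        self.sloc = sloc

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)


collection = {Item('main.c:parse', 100)}
duplicate = Item('main.c:parse', 120)

# Mirrors _update(): collect both measurements, average, and re-insert.
slocs = [duplicate.sloc] + [i.sloc for i in collection if i == duplicate]
duplicate.sloc = round(statistics.mean(slocs), 0)
collection = {duplicate}
assert duplicate.sloc == 110
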
def check_format(self, key, value):
    if not key \
            or not value['title'] \
            or not value['story'] \
            or not value['year']:
        logger.info('incomplete content ' + key + ' ' + str(value))

    for num in value['episodes']:
        episode = value['episodes'][num]
        if not episode['title'] \
                or not episode['story'] \
                or not episode['status']:
            logger.info('incomplete episode ' + key + ' ' + str(episode))

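# The shape check_format() expects, reconstructed from its key accesses. All
# field values (and the episode key type) below are placeholders, not taken
# from the source:
value = {
    'title': 'Example Title',
    'story': 'Synopsis text',
    'year': 2016,
    'episodes': {
        '1': {  # episode key type is an assumption
            'title': 'Episode 1',
            'story': 'Episode synopsis',
            'status': 'aired',
        },
    },
}
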
def handle(self, *args, **options):
    source = options['source']

    if not source or not os.path.exists(os.path.expanduser(source)):
        raise Exception('{0} is not a valid CSV path'.format(source))

    duplicates = list()
    functions = set()

    with open(source) as file_:
        reader = csv.reader(file_)
        for row in reader:
            if 'Function' in row[0]:
                name = row[1]
                file = row[2]
                sloc = int(row[3])

                # Only stage functions that are not already in the database.
                queryset = Function.objects.filter(name=name, file=file)
                if not queryset.exists():
                    function = Function()
                    function.name = name
                    function.file = file
                    function.sloc = sloc

                    if function not in functions:
                        functions.add(function)
                    else:
                        # Drop both copies of an in-file duplicate; record
                        # them so they can be logged below.
                        duplicates.append(function)
                        function = [
                            f for f in functions if f == function
                        ][0]
                        duplicates.append(function)
                        functions.remove(function)

    if len(functions) > 0:
        logger.debug('Adding {0} functions.'.format(len(functions)))
        Function.objects.bulk_create(functions)

    if len(duplicates) > 0:
        for function in duplicates:
            logger.debug(
                'Duplicate {0} in {1} with {2} SLOC'.format(
                    function.name, function.file, function.sloc
                )
            )

    logger.info('Appended {0} functions.'.format(len(functions)))

def handle(self, *args, **options):
    source = options['source']

    if not source or not os.path.exists(os.path.expanduser(source)):
        raise Exception('{0} is not a valid CSV path'.format(source))

    functions = set()
    files = set()

    with open(source) as infile:
        reader = csv.reader(infile)
        for row in reader:
            if 'Function' in row[0]:
                function = Function()
                function.name = row[1]
                function.file = row[2]
                function.sloc = int(row[3])

                if function not in functions:
                    functions.add(function)
                else:
                    functions = self._update(function, functions)
            elif 'File' in row[0]:
                file_ = File()
                file_.name = row[2]
                file_.sloc = int(row[3])

                if file_ not in files:
                    files.add(file_)
                else:
                    files = self._update(file_, files)

    if len(functions) > 0:
        logger.debug('Adding {0} functions.'.format(len(functions)))
        Function.objects.bulk_create(functions)
        logger.info('Loaded {0} functions.'.format(len(functions)))

    if len(files) > 0:
        logger.debug('Adding {0} files.'.format(len(files)))
        File.objects.bulk_create(files)
        logger.info('Loaded {0} files.'.format(len(files)))

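# The CSV layout the loaders above imply, reconstructed from the row indexing
# (row[0] tags the record type). This is an inference from the code, not a
# documented format, and the names/paths are placeholders:
#
#   Function,parse_args,src/cli.c,42
#   Function,main,src/main.c,120
#   File,,src/main.c,310
#
# 'Function' rows carry name (row[1]), file (row[2]), and SLOC (row[3]);
# 'File' rows carry name (row[2]) and SLOC (row[3]), leaving row[1] unused.
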
def handle(self, *args, **options):
    source = options["source"]

    if not source or not os.path.exists(os.path.expanduser(source)):
        raise Exception("{0} is not a valid CSV path".format(source))

    found = 0
    missing = set()

    with open(source) as file_:
        reader = csv.reader(file_)
        for row in reader:
            name = row[0]
            file = row[1]
            sloc = int(row[2]) if row[2] else None

            function = None
            functions = Function.objects.filter(name=name)
            if functions.exists():
                function = functions.filter(file=file)
                if function.exists():
                    # Exact match
                    function = function.get()
                elif functions.count() == 1:
                    # Guesstimate: when filtering a function by name alone
                    # yields one result, it may be appropriate to assume
                    # that result is the one we are looking for.
                    function = functions.get()
                    logger.debug("{0} ~ {1}".format(name, function.identity))

            if function is not None and type(function) is Function:
                found += 1
            else:
                missing.add("{0}@{1}".format(name, file))

    logger.info("{0} functions have SLOC".format(found))
    logger.info("{0} functions do not have SLOC".format(len(missing)))
    for item in missing:
        logger.debug("{0}".format(item))

def handle(self, *args, **options):
    """Report orphaned and duplicated sentences per review year."""
    processes = options['processes']

    begin = dt.now()
    try:
        years = range(2008, 2017)

        review_ids = {year: [] for year in years}
        print('REVIEWS:')
        for year in review_ids:
            review_ids[year] = list(qs.query_by_year(year, 'review', ids=True))
            print('\t{0}: {1}'.format(year, len(review_ids[year])))
        connections.close_all()

        comment_ids = {year: [] for year in years}
        message_ids = {year: [] for year in years}
        for year in review_ids:
            comment_ids[year] = list(
                qs.query_by_year(year, 'comment', ids=True)
            )
            connections.close_all()
            message_ids[year] = list(
                qs.query_by_year(year, 'message', ids=True)
            )
            connections.close_all()

        print('COMMENTS:')
        for year, ids in comment_ids.items():
            print('\t{0}: {1}'.format(year, len(ids)))
        print('MESSAGES:')
        for year, ids in message_ids.items():
            print('\t{0}: {1}'.format(year, len(ids)))

        comment_sentences_ids = {year: [] for year in years}
        message_sentences_ids = {year: [] for year in years}

        print('COMMENT_SENTENCES:')
        for year, ids in comment_ids.items():
            comments = Comment.objects.filter(id__in=ids)
            connections.close_all()
            for c in comments:
                # flat=True yields bare ids; without it, values_list returns
                # 1-tuples that would never match the integer ids compared
                # against below.
                comment_sentences_ids[year] += list(
                    c.sentences.values_list('id', flat=True)
                )
            print('\t{0}: {1}'.format(year, len(comment_sentences_ids[year])))

        print('MESSAGE_SENTENCES:')
        for year, ids in message_ids.items():
            messages = Message.objects.filter(id__in=ids)
            connections.close_all()
            for m in messages:
                message_sentences_ids[year] += list(
                    m.sentences.values_list('id', flat=True)
                )
            print('\t{0}: {1}'.format(year, len(message_sentences_ids[year])))

        sentences = list(
            qs.query_all('sentence', ids=False).values_list('id', 'text')
        )
        connections.close_all()

        orphans = {year: [] for year in years}
        duplicates = {year: [] for year in years}
        for year in years:
            print('YEAR: {0}'.format(year))
            # Set lookups keep this pass linear in the number of sentences.
            comment_set = set(comment_sentences_ids[year])
            message_set = set(message_sentences_ids[year])
            for sentence in sentences:
                if sentence[0] not in comment_set \
                        and sentence[0] not in message_set:
                    orphans[year].append(sentence[0])
                elif sentence[0] in comment_set \
                        and sentence[0] in message_set:
                    duplicates[year].append(sentence[0])

        print('================')
        print('ORPHANS:')
        for year, ids in orphans.items():
            print('\t{0}: {1}'.format(year, len(ids)))
        print('DUPLICATES:')
        for year, ids in duplicates.items():
            print('\t{0}: {1}'.format(year, len(ids)))

        connections.close_all()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
            helpers.get_elapsed(begin, dt.now())
        ))