Example #1
0
 def parse(self, response):
     """
     Default callback: follow every chronicle title link.

     For each ``.chronicle_title > a`` anchor in the response, the extracted
     text nodes are logged and a follow-up request is yielded with
     ``self.parse_season`` as its callback.
     """
     anchors = response.css('.chronicle_title > a')
     for anchor in anchors:
         label = anchor.css('::text').extract()
         logger.info(label)
         yield response.follow(anchor, self.parse_season)
Example #2
0
 def parse_season(self, response):
     """
     Follow every season link found in the sub menu.

     For each ``.subMenuListSeason > li > a`` anchor in the response, the
     extracted text nodes are logged and a follow-up request is yielded with
     ``self.parse_list`` as its callback.
     """
     anchors = response.css('.subMenuListSeason > li > a')
     for anchor in anchors:
         label = anchor.css('::text').extract()
         logger.info(label)
         yield response.follow(anchor, self.parse_list)
Example #3
0
    def handle(self, *args, **options):  # pragma: no cover
        """
        Tag comments using ``taggers.CommentLevelTagger``.

        Reads ``processes`` and ``year`` from ``options``. A ``year`` of 0
        selects comments through ``qs.query_all``; any other value selects
        them through ``qs.query_by_year``. Comments with empty text are
        excluded before the tagger runs.

        A ``KeyboardInterrupt`` is logged as a warning instead of propagating,
        and the elapsed time is always logged on exit.
        """
        processes = options['processes']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                comms = qs.query_all('comment', ids=False)
            else:
                comms = qs.query_by_year(year, 'comment', ids=False)

            comms = comms.exclude(text='').iterator()

            # Connections are closed before the tagger starts (presumably so
            # that worker processes open their own -- TODO confirm).
            connections.close_all()
            tagger = taggers.CommentLevelTagger(settings, processes, comms)
            tagger.tag()

        except KeyboardInterrupt:  # pragma: no cover
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
Example #4
0
    def handle(self, *args, **options):
        """
        Tag sentences using ``taggers.SentimentTagger``.

        Reads ``processes``, ``condition`` and ``year`` from ``options``.
        A ``year`` of 0 selects sentences through ``qs.query_all``; any other
        value selects them through ``qs.query_by_year``.

        ``condition`` narrows the selection:

        * ``'all'``: every sentence with non-empty text.
        * ``'empty'`` or ``'failed'``: sentences with non-empty text whose
          ``metrics__sentiment`` equals ``{}``.

        A ``KeyboardInterrupt`` is logged as a warning instead of propagating,
        and the elapsed time is always logged on exit.
        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        begin = dt.now()
        try:
            sents = (
                qs.query_all('sentence', ids=False)
                if year == 0
                else qs.query_by_year(year, 'sentence', ids=False)
            )

            # NOTE(review): any other condition value leaves ``sents``
            # unfiltered and un-iterated -- confirm this is intended.
            if condition == 'all':
                sents = sents.exclude(text='').iterator()
            elif condition in ('empty', 'failed'):
                untagged = sents.filter(metrics__sentiment={})
                sents = untagged.exclude(text='').iterator()

            connections.close_all()
            taggers.SentimentTagger(settings, processes, sents).tag()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            elapsed = helpers.get_elapsed(begin, dt.now())
            logger.info('Time: {:.2f} minutes.'.format(elapsed))
Example #5
0
    def handle(self, *args, **options):
        """
        Compare per-function SLOC between two CSV files.

        Reads the ``primary`` and ``secondary`` CSV paths from ``options``.
        In each file, rows whose first column contains ``'Function'`` are
        turned into a ``Function`` (name from column 1, file from column 2)
        mapped to the integer SLOC in column 3. Repeated functions within one
        file keep their first SLOC and are reported at debug level.

        Finally logs how many functions of the secondary file are also in the
        primary file with an equal SLOC.

        Raises:
            Exception: if either path is empty or does not exist.
        """
        primary = options['primary']
        secondary = options['secondary']

        for path in (primary, secondary):
            if not path or not os.path.exists(os.path.expanduser(path)):
                raise Exception('{0} is not a valid CSV path'.format(path))

        def load(path):
            """Map each 'Function' row of the CSV at ``path`` to its SLOC."""
            dataset = dict()
            # Open the same expanded path that was validated above; otherwise
            # a '~'-prefixed path passes the check and then fails to open.
            with open(os.path.expanduser(path)) as file_:
                for row in csv.reader(file_):
                    # csv.reader yields an empty list for blank lines.
                    if not row or 'Function' not in row[0]:
                        continue

                    function = Function()
                    function.name = row[1]
                    function.file = row[2]

                    if function not in dataset:
                        dataset[function] = int(row[3])
                    else:
                        logger.debug(
                            '{0} duplicate in {1}'.format(function, path)
                        )
            return dataset

        primary_dataset = load(primary)
        secondary_dataset = load(secondary)

        match = sum(
            1 for item, sloc in secondary_dataset.items()
            if item in primary_dataset and primary_dataset[item] == sloc
        )

        logger.info(
            '{0}/{1} having matching SLOC'.format(
                match, len(secondary_dataset)
            )
        )
Example #6
0
 def parse_list(self, response):
     """
     Follow every title on a listing page, then the pagination links.

     For each ``span.animeTitle > a`` anchor the title text is logged and a
     request handled by ``self.parse_item`` is yielded. For each ``a.next``
     anchor a request handled by ``self.parse_list`` is yielded, after which
     the master file is saved.
     """
     # Load the master file (singleton).
     master = master_file.MasterFile()
     for page in response.css('span.animeTitle > a'):
         # Pass the title as a single %s argument. Unpacking the extracted
         # list straight into logger.info() raises TypeError when the list is
         # empty and treats extra text nodes as format arguments.
         logger.info('%s', ''.join(page.css('::text').extract()))
         yield response.follow(page, self.parse_item)
     for page in response.css('a.next'):
         yield response.follow(page, self.parse_list)
         # NOTE(review): save() only runs once per ``a.next`` link, after its
         # request has been consumed; a page without one never saves --
         # confirm this is intended.
         master.save()
Example #7
0
 def _update(self, instance, collection):
     """
     Replace the element of ``collection`` equal to ``instance``.

     The first element comparing equal to ``instance`` is removed from
     ``collection``. ``instance.sloc`` is set to the mean of both SLOC
     values, rounded to zero decimals, and ``instance`` is added in its
     place. When no equal element exists, ``instance`` is added with its own
     (rounded) SLOC and nothing is logged.

     Returns:
         The same ``collection`` object, modified in place.
     """
     # Track the match explicitly rather than reusing the loop variable after
     # the loop: that is unbound for an empty collection and points at an
     # unrelated element when nothing matched.
     duplicate = None
     for item in collection:
         if item == instance:
             duplicate = item
             break

     slocs = [instance.sloc]
     if duplicate is not None:
         slocs.append(duplicate.sloc)
         collection.remove(duplicate)

     instance.sloc = round(statistics.mean(slocs), 0)
     if duplicate is not None:
         logger.info('Duplicate {0}:{1} replaced with {2}:{3}'.format(
             duplicate, duplicate.sloc, instance, instance.sloc
         ))
     collection.add(instance)
     return collection
Example #8
0
    def check_format(self, key, value):
        """
        Log content entries and episodes that have empty required fields.

        An entry is reported when ``key`` or any of ``value['title']``,
        ``value['story']`` or ``value['year']`` is falsy. Each episode in
        ``value['episodes']`` is reported when its ``'title'``, ``'story'`` or
        ``'status'`` is falsy. Nothing is returned; findings are only logged.
        """
        if not (key and value['title'] and value['story'] and value['year']):
            # str(key): a None key is one of the cases reported here, and
            # concatenating it directly would raise TypeError.
            logger.info('incomplete content ' + str(key) + ' ' + str(value))

        for num in value['episodes']:
            episode = value['episodes'][num]
            if not (episode['title']
                    and episode['story']
                    and episode['status']):
                logger.info('incomplete episode ' + str(key) + ' ' +
                            str(episode))
Example #9
0
    def handle(self, *args, **options):
        """
        Append functions from a CSV file that are not yet in the database.

        Reads the ``source`` CSV path from ``options``. Rows whose first
        column contains ``'Function'`` supply a name (column 1), file
        (column 2) and integer SLOC (column 3). Rows that already match a
        stored ``Function`` by name and file are skipped. When a new function
        repeats one collected earlier, both are recorded as duplicates and
        the earlier one is dropped from the set to create.

        The remaining functions are inserted with ``bulk_create``.

        Raises:
            Exception: if the path is empty or does not exist.
        """
        source = options['source']

        if not source or not os.path.exists(os.path.expanduser(source)):
            raise Exception('{0} is not a valid CSV path'.format(source))

        duplicates = list()
        functions = set()
        # Open the same expanded path that was validated above; otherwise a
        # '~'-prefixed path passes the check and then fails to open.
        with open(os.path.expanduser(source)) as file_:
            for row in csv.reader(file_):
                # csv.reader yields an empty list for blank lines.
                if not row or 'Function' not in row[0]:
                    continue

                name = row[1]
                file_name = row[2]
                sloc = int(row[3])

                if Function.objects.filter(name=name, file=file_name).exists():
                    continue

                function = Function()
                function.name = name
                function.file = file_name
                function.sloc = sloc

                if function not in functions:
                    functions.add(function)
                else:
                    duplicates.append(function)
                    existing = next(f for f in functions if f == function)
                    duplicates.append(existing)
                    functions.remove(existing)

        if functions:
            logger.debug('Adding {0} functions.'.format(len(functions)))
            Function.objects.bulk_create(functions)

            # NOTE(review): duplicates are only reported when at least one
            # function was created -- confirm this nesting is intended.
            for duplicate in duplicates:
                logger.debug(
                    'Duplicate {0} in {1} with {2} SLOC'.format(
                        duplicate.name, duplicate.file, duplicate.sloc
                    )
                )
            logger.info('Appended {0} functions.'.format(len(functions)))
Example #10
0
    def handle(self, *args, **options):
        """
        Load functions and files with their SLOC from a CSV file.

        Reads the ``source`` CSV path from ``options``. Rows whose first
        column contains ``'Function'`` become ``Function`` objects (name from
        column 1, file from column 2, SLOC from column 3). Otherwise, rows
        whose first column contains ``'File'`` become ``File`` objects (name
        from column 2, SLOC from column 3). An object equal to one already
        collected is passed to ``self._update``, whose returned collection
        replaces the current one.

        Collected objects are inserted with ``bulk_create``.

        Raises:
            Exception: if the path is empty or does not exist.
        """
        source = options['source']

        if not source or not os.path.exists(os.path.expanduser(source)):
            raise Exception('{0} is not a valid CSV path'.format(source))

        functions = set()
        files = set()
        # Open the same expanded path that was validated above; otherwise a
        # '~'-prefixed path passes the check and then fails to open.
        with open(os.path.expanduser(source)) as handle_:
            for row in csv.reader(handle_):
                # csv.reader yields an empty list for blank lines.
                if not row:
                    continue

                if 'Function' in row[0]:
                    function = Function()
                    function.name = row[1]
                    function.file = row[2]
                    function.sloc = int(row[3])

                    if function not in functions:
                        functions.add(function)
                    else:
                        functions = self._update(function, functions)
                elif 'File' in row[0]:
                    # Named distinctly from the open file handle, which the
                    # original loop rebound.
                    source_file = File()
                    source_file.name = row[2]
                    source_file.sloc = int(row[3])

                    if source_file not in files:
                        files.add(source_file)
                    else:
                        files = self._update(source_file, files)

        if functions:
            logger.debug('Adding {0} functions.'.format(len(functions)))
            Function.objects.bulk_create(functions)
            logger.info('Loaded {0} functions.'.format(len(functions)))

        if files:
            logger.debug('Adding {0} files.'.format(len(files)))
            File.objects.bulk_create(files)
            logger.info('Loaded {0} files.'.format(len(files)))
Example #11
0
    def handle(self, *args, **options):
        """
        Report which functions listed in a CSV file exist in the database.

        Reads the ``source`` CSV path from ``options``. Each row supplies a
        function name (column 0) and file (column 1). A row counts as found
        when a stored ``Function`` matches both name and file, or when
        exactly one stored ``Function`` has that name. All other rows are
        collected as ``name@file`` and logged at debug level.

        Raises:
            Exception: if the path is empty or does not exist.
        """
        source = options["source"]

        if not source or not os.path.exists(os.path.expanduser(source)):
            raise Exception("{0} is not a valid CSV path".format(source))

        found = 0
        missing = set()
        # Open the same expanded path that was validated above; otherwise a
        # '~'-prefixed path passes the check and then fails to open.
        with open(os.path.expanduser(source)) as file_:
            for row in csv.reader(file_):
                # csv.reader yields an empty list for blank lines.
                if not row:
                    continue

                name = row[0]
                file = row[1]

                # Only ever holds a Function instance or None, so no type
                # check is needed afterwards.
                match = None
                candidates = Function.objects.filter(name=name)
                if candidates.exists():
                    exact = candidates.filter(file=file)
                    if exact.exists():
                        # Exact Match
                        match = exact.get()
                    elif candidates.count() == 1:
                        # Guesstimate: When filtering a function by name alone
                        # yields one result, it may be appropriate to assume
                        # that result to be the one we are looking for.
                        match = candidates.get()
                        logger.debug("{0} ~ {1}".format(name, match.identity))

                if match is not None:
                    found += 1
                else:
                    missing.add("{0}@{1}".format(name, file))

        logger.info("{0} functions have SLOC".format(found))
        logger.info("{0} functions do not have SLOC".format(len(missing)))
        for item in missing:
            logger.debug("{0}".format(item))
0
    def handle(self, *args, **options):
        """
        Print per-year counts of reviews, comments, messages and sentences.

        For each year from 2008 through 2016 this prints the number of
        reviews, comments and messages returned by ``qs.query_by_year``, and
        the number of sentence ids reachable through the comments and
        messages of that year. Every sentence id from ``qs.query_all`` is
        then classified per year:

        * orphan: in neither the comment nor the message sentences of
          that year;
        * duplicate: in both.

        A ``KeyboardInterrupt`` is logged as a warning instead of propagating,
        and the elapsed time is always logged on exit.
        """
        years = range(2008, 2017)

        def sentence_ids_by_year(model, ids_by_year):
            """Collect, per year, the sentence ids linked to ``model`` rows."""
            collected = {}
            for year, ids in ids_by_year.items():
                rows = model.objects.filter(id__in=ids)
                connections.close_all()
                collected[year] = []
                for row in rows:
                    # flat=True yields plain ids. Without it values_list
                    # returns 1-tuples, which never equal the integer ids
                    # tested for membership below.
                    collected[year] += list(
                        row.sentences.values_list('id', flat=True)
                    )
                print("\t{0}: {1}".format(
                    str(year), str(len(collected[year]))
                ))
            return collected

        begin = dt.now()
        try:
            print("REVIEWS:")
            review_ids = {}
            for year in years:
                review_ids[year] = list(
                    qs.query_by_year(year, 'review', ids=True)
                )
                print("\t{0}: {1}".format(
                    str(year), str(len(review_ids[year]))
                ))
                connections.close_all()

            comment_ids = {}
            message_ids = {}
            for year in years:
                comment_ids[year] = list(
                    qs.query_by_year(year, 'comment', ids=True)
                )
                connections.close_all()
                message_ids[year] = list(
                    qs.query_by_year(year, 'message', ids=True)
                )
                connections.close_all()

            print("COMMENTS:")
            for k, v in comment_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            print("MESSAGES:")
            for k, v in message_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            print("COMMENT_SENTENCES:")
            comment_sentences_ids = sentence_ids_by_year(Comment, comment_ids)

            print("MESSAGE_SENTENCES:")
            message_sentences_ids = sentence_ids_by_year(Message, message_ids)

            # Only the id is used below, so the text column is not fetched.
            sentence_ids = list(
                qs.query_all('sentence', ids=False)
                .values_list('id', flat=True)
            )
            connections.close_all()

            # Sets make the membership tests in the nested loop O(1).
            in_comments = {
                year: set(ids) for year, ids in comment_sentences_ids.items()
            }
            in_messages = {
                year: set(ids) for year, ids in message_sentences_ids.items()
            }

            orphans = {year: [] for year in years}
            duplicates = {year: [] for year in years}
            for sentence_id in sentence_ids:
                for year in years:
                    # NOTE(review): printed once per sentence per year; looks
                    # like leftover debugging output -- confirm before removing.
                    print("YEAR: {0}".format(str(year)))
                    commented = sentence_id in in_comments[year]
                    messaged = sentence_id in in_messages[year]
                    if not commented and not messaged:
                        orphans[year].append(sentence_id)
                    elif commented and messaged:
                        duplicates[year].append(sentence_id)

            print("================")
            print("ORPHANS:")
            for year, ids in orphans.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            print("DUPLICATES:")
            for year, ids in duplicates.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            connections.close_all()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'
                .format(helpers.get_elapsed(begin, dt.now())))