Code Example #1
File: sentiment.py Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                sents = qs.query_all('sentence', ids=False)
            else:
                sents = qs.query_by_year(year, 'sentence', ids=False)

            if condition == 'all':
                sents = sents.exclude(text='').iterator()
            elif condition in ('empty', 'failed'):
                sents = sents.filter(metrics__sentiment={}).exclude(
                    text='').iterator()

            connections.close_all()
            tagger = taggers.SentimentTagger(settings, processes, sents)
            tagger.tag()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
Code Example #2
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        population = options['population']
        root = options['root']
        year = options['year']
        begin = dt.now()
        try:
            if year is not None:
                sentences = qs.query_by_year(year, 'sentence', ids=False)
            else:
                sentences = qs.query_all('sentence', ids=False)
            sentences = sentences.iterator()

            connections.close_all()
            tagger = taggers.UncertaintyTagger(settings, processes, sentences,
                                               root)
            tagger.tag()
        except KeyboardInterrupt:
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
Code Example #3
File: sourcecode.py Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        metrics = options['metrics']
        year = options['year']
        begin = dt.now()
        try:
            if year != 0:
                tokens = qs.query_by_year(
                    year, 'token', ids=False).exclude(token='').iterator()
            else:
                tokens = qs.query_all('token',
                                      ids=False).exclude(token='').iterator()
            connections.close_all()
            tagger = taggers.SourceCodeTagger(settings, processes, tokens)
            tagger.tag()
        except KeyboardInterrupt:  # pragma: no cover
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
Code Example #4
    def handle(self, *args, **options):  # pragma: no cover
        """

        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                comms = qs.query_all('comment', ids=False)
            else:
                comms = qs.query_by_year(year, 'comment', ids=False)

            comms = comms.exclude(text='').iterator()

            connections.close_all()
            tagger = taggers.CommentLevelTagger(settings, processes, comms)
            tagger.tag()

        except KeyboardInterrupt:  # pragma: no cover
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
Code Example #5
File: loadMetrics.py Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        metrics = options['metrics']
        year = options['year']
        begin = dt.now()
        try:
            if year != 0:
                sentences = qs.query_by_year(year, 'sentence', ids=False).exclude(text='')
            else:
                sentences = qs.query_all('sentence', ids=False).exclude(text='')
            connections.close_all()
            tagger = taggers.MetricsTagger(settings, processes, sentences, metrics)
            tagger.tag()
        except KeyboardInterrupt: # pragma: no cover
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
Code Example #6
File: politeness.py Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        population = options['population']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                sents = qs.query_all('sentence', ids=False).exclude(text='') \
                          .iterator()
            else:
                sents = qs.query_by_year(year, 'sentence', ids=False).exclude(text='') \
                          .iterator()

            connections.close_all()
            tagger = taggers.PolitenessTagger(settings, processes, sents)
            tagger.tag()

        except KeyboardInterrupt:
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
Code Example #7
File: helpers.py Project: andymeneely/sira-nlp
JSON_NULL = json.dumps(None)

# The two regular expressions that follow match components of the header text
# that is automatically inserted when a developer responds to a comment on
# Rietveld.
# E.g. On 2008/01/01 at 00:00:01, Raymond Reddington wrote:

# Match the date and time in the header that is inserted into comment responses
# E.g. 2008/01/01 at 00:00:01
DATE_TIME_RE = re.compile(
    r'(?P<date>\d{4}/\d{2}/\d{2})(?:\s|\sat\s)(?P<time>\d{2}:\d{2}:\d{2})')
# Match the name of the author in the header that is inserted into comment responses
# E.g. Raymond Reddington
AUTHOR_RE = re.compile(', (.*) wrote:')

TOKENS = qs.query_all('token', ids=False)
PRONOUNS_1 = [
    'I', 'ME', 'MYSELF', 'MY', 'MINE', 'WE', 'US', 'OURSELVES', 'OUR', 'OURS'
]
PRONOUNS_2 = ['YOU', 'YOURSELF', 'YOUR', 'YOURS', 'YOURSELVES']
PRONOUNS_3 = [
    'SHE', 'ITSELF', 'HER', 'HE', 'ITS', 'HIM', 'IT', 'HIMSELF', 'HERSELF',
    'HERS', 'HIS', 'THEY', 'THEM', 'THEMSELVES', 'THEIR', 'THEIRS'
]


def get_syllable_count(tokens):
    total = 0
    pron = []
    for token in tokens:
        word = token.strip("\n")
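
A quick usage sketch for the two regular expressions defined in the helpers.py excerpt above (not part of the original file; the sample header is the one shown in its comments):

import re

DATE_TIME_RE = re.compile(
    r'(?P<date>\d{4}/\d{2}/\d{2})(?:\s|\sat\s)(?P<time>\d{2}:\d{2}:\d{2})')
AUTHOR_RE = re.compile(', (.*) wrote:')

header = 'On 2008/01/01 at 00:00:01, Raymond Reddington wrote:'

# search() rather than match() is needed because the header begins with 'On '.
m = DATE_TIME_RE.search(header)
if m:
    print(m.group('date'), m.group('time'))  # 2008/01/01 00:00:01

m = AUTHOR_RE.search(header)
if m:
    print(m.group(1))  # Raymond Reddington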
Code Example #8
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        year = options['year']
        begin = dt.now()
        try:
            info('loaddb Command')
            info('  Years: {}'.format(settings.YEARS))

            if year != 0:
                settings.YEARS = [year]

            loader = loaders.BugLoader(settings, processes)
            count = loader.load()
            info('  {:,} bugs loaded'.format(count))

            loader = loaders.VulnerabilityLoader(settings, processes)
            count = loader.load()
            info('  {:,} vulnerabilities loaded'.format(count))

            loader = loaders.ReviewLoader(settings, processes)
            count = loader.load()
            info('  {:,} reviews loaded'.format(count))

            tagger = taggers.MissedVulnerabilityTagger(settings, processes)
            count = tagger.tag()
            info('  {:,} reviews missed a vulnerability'.format(count))

            if year != 0:
                ids = qs.query_by_year(year, 'review', True)
            else:
                ids = qs.query_all('review', True)
            connections.close_all()  # Hack

            # Comments
            loader = loaders.CommentLoader(settings, processes, ids)
            count = loader.load()
            info('  {:,} comments loaded'.format(count))
            connections.close_all()  # Hack
            loader = loaders.SentenceCommentLoader(settings, processes, ids)
            count = loader.load()
            info('  {:,} sentences loaded'.format(count))
            connections.close_all()  # Hack

            tagger = taggers.UsefulCommentTagger(settings, processes, ids)
            count = tagger.tag()
            info('  {:,} comments were useful'.format(count))

            # Messages
            connections.close_all()  # Hack
            loader = loaders.MessageLoader(settings, processes, ids)
            count = loader.load()
            info('  {:,} messages loaded'.format(count))
            connections.close_all()  # Hack
            loader = loaders.SentenceMessageLoader(settings, processes, ids)
            count = loader.load()
            info('  {:,} sentences loaded'.format(count))
            connections.close_all()  # Hack

            # Tokens
            loader = loaders.TokenLoader(settings, processes, ids)
            count = loader.load()
            info('  {:,} tokens loaded'.format(count))

            with connection.cursor() as cursor:
                cursor.execute('REFRESH MATERIALIZED VIEW vw_review_token;')
                cursor.execute('REFRESH MATERIALIZED VIEW vw_review_lemma;')
        except KeyboardInterrupt:  # pragma: no cover
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
Code Example #9
File: sandbox.py Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        begin = dt.now()
        try:
            review_ids = {year: [] for year in range(2008, 2017)}
            print("REVIEWS:")
            for i in review_ids.keys():
                review_ids[i] = list(qs.query_by_year(i, 'review', ids=True))
                print("\t{0}: {1}".format(str(i), str(len(review_ids[i]))))
                connections.close_all()


            comment_ids = {year: [] for year in range(2008, 2017)}
            message_ids = {year: [] for year in range(2008, 2017)}
            for year, ids in review_ids.items():
                comment_ids[year] = list(qs.query_by_year(year, 'comment', ids=True))
                connections.close_all()
                message_ids[year] = list(qs.query_by_year(year, 'message', ids=True))
                connections.close_all()

            print("COMMENTS:")
            for k, v in comment_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            print("MESSAGES:")
            for k, v in message_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            comment_sentences_ids = {year: [] for year in range(2008, 2017)}
            message_sentences_ids = {year: [] for year in range(2008, 2017)}

            print("COMMENT_SENTENCES:")
            for year, ids in comment_ids.items():
                comments = Comment.objects.filter(id__in=ids)
                connections.close_all()
                for c in comments:
                    # flat=True yields ids rather than 1-tuples, so the
                    # membership tests against sentence[0] below work.
                    comment_sentences_ids[year] += list(
                        c.sentences.values_list('id', flat=True))
                print("\t{0}: {1}".format(str(year), str(len(comment_sentences_ids[year]))))

            print("MESSAGE_SENTENCES:")
            for year, ids in message_ids.items():
                messages = Message.objects.filter(id__in=ids)
                connections.close_all()
                for m in messages:
                    message_sentences_ids[year] += list(
                        m.sentences.values_list('id', flat=True))
                print("\t{0}: {1}".format(str(year), str(len(message_sentences_ids[year]))))

            sentences = list(qs.query_all('sentence', ids=False).values_list('id', 'text'))
            connections.close_all()

            orphans = {year: [] for year in range(2008, 2017)}
            duplicates = {year: [] for year in range(2008, 2017)}
            for sentence in sentences:
                for year in review_ids.keys():
                    if (sentence[0] not in comment_sentences_ids[year] and
                            sentence[0] not in message_sentences_ids[year]):
                        orphans[year].append(sentence[0])
                    elif (sentence[0] in comment_sentences_ids[year] and
                          sentence[0] in message_sentences_ids[year]):
                        duplicates[year].append(sentence[0])

            print("================")
            print("ORPHANS:")
            for year, ids in orphans.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            print("DUPLICATES:")
            for year, ids in duplicates.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            connections.close_all()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'
                .format(helpers.get_elapsed(begin, dt.now())))