def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems data"
        queryset = Newsitem.objects\
            .using(database)\
            .exclude(cat1__isnull=True)\
            .only('cat1')

        newsitems_cat1 = {}
        self.pbar_setup(maxval=queryset.count())
        for newsitem in queryset_iterator(queryset, chunksize=1000):
            if newsitem.cat1 is None:
                pdb.set_trace()
            newsitems_cat1[newsitem.id] = newsitem.cat1
            self.pbar_increment()
        self.pbar_destroy()

        print "Grabbing comments and their texts, then dumping them to target folder ..."
        queryset = Comment.objects\
            .using(database)\
            .filter(newsitem_id__in=newsitems_cat1.keys())\
            .only('date', 'newsitem_id', 'parent_id', 'text')

        self.pbar_setup(maxval=queryset.count())
        for comment in queryset_iterator(queryset, chunksize=1000):
            if not comment.text or not comment.text.text:
                continue

            cat1 = newsitems_cat1[comment.newsitem_id]
            if cat1 is None:
                pdb.set_trace()

            filename = "{}_{}_{}.txt".format(
                cat1, comment.date.date().isoformat(), comment.id)

            target_filename = os.path.join(options['TARGET_DIR_ALL'], filename)
            fileobj = codecs.open(target_filename, 'w', 'utf-8')
            fileobj.write(comment.text.text)
            fileobj.close()

            if comment.parent_id is None:
                target_filename = os.path.join(options['TARGET_DIR_NEWS'], filename)
                fileobj = codecs.open(target_filename, 'w', 'utf-8')
                fileobj.write(comment.text.text)
                fileobj.close()

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
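# `queryset_iterator` is a project helper used throughout these commands but not
# defined in this file. The sketch below is only an assumed, minimal version of
# the common "chunked queryset" recipe (names and the pk-ordering strategy are
# assumptions, not the project's confirmed code): it walks a large queryset in
# primary-key order, `chunksize` rows at a time, and collects garbage between
# chunks so the queryset cache does not exhaust memory.
import gc

def queryset_iterator(queryset, chunksize=1000):
    if not queryset.exists():
        return
    pk = 0
    last_pk = queryset.order_by('-pk')[0].pk
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row
        gc.collect()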
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    images_re = re.compile("!\[(.+)\]\((.+)\)")
    socials_re = re.compile("Facebook Twitter Pinterest")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and their texts, then dumping them to target folder ..."
        queryset = Newsitem.objects\
            .using(database)\
            .exclude(cat1__isnull=True)\
            .only('date', 'cat1', 'title', 'text')

        self.pbar_setup(maxval=queryset.count())
        for newsitem in queryset_iterator(queryset, chunksize=1000):
            if not newsitem.text or not newsitem.text.text:
                continue

            filename = "{}_{}_news_{}.txt".format(
                newsitem.cat1, newsitem.date.date().isoformat(), newsitem.id)

            target_filename = os.path.join(options['TARGET_DIR'], filename)
            fileobj = codecs.open(target_filename, 'w', 'utf-8')
            fileobj.write(newsitem.title + ". \n" + newsitem.text.text)
            fileobj.close()

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    citations_re = re.compile("\".*\"")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and calculating..."
        queryset = Text.objects.using(database).filter(newsitem__isnull=False)

        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=1000):
            qty_citations = len(citations_re.findall(text.text))
            if qty_citations > 0:
                query = """
                    UPDATE newsitem
                    SET qty_citations = %s
                    WHERE text_id = %s
                """
                cursor.execute(query, [qty_citations, text.id])
            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    print "Loading classifier"
    classifier_path = os.path.join(
        settings.BASE_DIR,
        "lda_stats/classifiers/naive_bayes.classifier.cpickle")
    nb = cPickle.load(open(classifier_path, 'rb'))

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_naive_bayes, created = Algorithm.objects.using(database)\
            .get_or_create(name="naive_bayes")

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm WHERE name = 'naive_bayes'
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        print "Querying database"
        queryset = Text.objects.using(database).filter(**filters)

        results = []
        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=2000):
            estimate = nb.predict([
                utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
            ])
            results.append(
                Result(algorithm=algorithm_naive_bayes,
                       text=text,
                       value=str(estimate[0])))
            self.pbar_increment()

            if len(results) > 100000:
                print "\nSaving partial results..."
                Result.objects.using(database).bulk_create(results)
                results = []
        self.pbar_destroy()

        print "Saving results"
        Result.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
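# Several commands in this file feed classifiers with
# utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text))). Those helpers live in
# the project's utils module and are not reproduced here; the sketch below only
# illustrates the assumed contract (tokenize -> term-weight dict -> fixed-order
# feature vector). The names, the plain term-frequency weighting, and the shared
# vocabulary list are assumptions for illustration, not the project's code.
import re
from collections import Counter

VOCABULARY = []  # assumed: a fixed, shared word list so feature vectors align

def tokenize(text):
    return re.findall(r"\w+", text.lower())

def TFIDF(tokens):
    counts = Counter(tokens)
    total = float(len(tokens)) or 1.0
    return dict((word, counts[word] / total) for word in counts)

def TFIDF_to_list(weights):
    return [weights.get(word, 0.0) for word in VOCABULARY]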
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    queryset = Newsitem.objects.filter(topics__isnull=True)
    self.pbar_setup(maxval=queryset.count())
    done = 0
    for newsitem in queryset_iterator(queryset):
        try:
            resp = requests.get(newsitem.url)
            soup = BeautifulSoup(resp.text, "html.parser")
            for meta in soup('meta'):
                attrs = meta.__dict__['attrs']
                if 'property' in attrs and attrs['property'] == 'article:tag':
                    with transaction.atomic():
                        for topic_name in attrs['content'].split(','):
                            topic, created = Topic.objects.get_or_create(
                                name=topic_name)
                            newsitem.topics.add(topic)
                        newsitem.save()
                    break
        except Exception as e:
            print "Error processing URL: {0} -=- message: {1}".format(
                newsitem.url, e)
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def migrate(apps, schema_editor, direction):
    pbar_widgets = [
        SimpleProgress(), ' ',
        Percentage(), ' ',
        Bar(), ' ',
        Timer(), ' ',
        AdaptiveETA()
    ]

    for model in ['Comment', 'Newsitem']:
        queryset = globals()[model].objects.filter(text__isnull=True)
        if queryset.count() == 0:
            continue

        print "Processing " + model
        pbar = ProgressBar(widgets=pbar_widgets, maxval=queryset.count())
        done = 0
        pbar.start()

        tokenizer = RegexpTokenizer(r'\w+')
        for item in queryset_iterator(queryset, chunksize=20):
            if item.content:
                if direction == "forward":
                    item.text = Text.objects.create(
                        text=item.content,
                        wordcount=len(tokenizer.tokenize(item.content)))
                elif direction == "backwards":
                    item.content = item.text.text
                item.save()
            done += 1
            pbar.update(done)
        pbar.finish()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    self.html_parser = HTMLParser.HTMLParser()

    for db_name in self.selected_dbs:
        print "Calculating wordcount for database " + db_name
        queryset = Text.objects.using(db_name).all()

        with transaction.atomic():
            self.pbar_setup(maxval=queryset.count())
            for text in queryset_iterator(queryset, chunksize=10000):
                wordcount = len(self.clean_text(text.text).split())
                text.wordcount = wordcount
                text.save()
                self.pbar_increment()
            self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def generate_author_data(self, database):
    self.author_data = {}

    for model in [Newsitem, Comment]:
        if model == Newsitem:
            model_name = "Newsitem"
        else:
            model_name = "Comment"
        self.stdout.write("Processing model " + model_name)

        queryset = model.objects.using(database)\
            .filter(text__isnull=False)\
            .select_related('text__text')
        if model == Comment:
            queryset = queryset.select_related('newsitem__idauthor',
                                               'parent__authorid')

        self.pbar_setup(maxval=queryset.count())
        queryset_iter = queryset_iterator(queryset, chunksize=100)

        pool = Pool()
        #pool.map_async(work, queryset_iter, callback=self.pbar_increment())
        #for item, tokenized in pool.imap_unordered(tokenize, queryset_iter, 100):
        for item in queryset_iter:
            item, tokenized = tokenize(item)
            self.work(item, tokenized)
            self.pbar_increment()
        self.pbar_destroy()
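# `tokenize` above is a module-level worker (so it could also be handed to a
# multiprocessing.Pool, as the commented-out lines suggest). It is not defined
# in this file; this is a minimal sketch of the assumed contract only -- it
# takes a Newsitem/Comment instance and returns the same instance together with
# its tokenized text. The NLTK call is an assumption for illustration.
import nltk

def tokenize(item):
    text = item.text.text if item.text else ""
    return item, nltk.word_tokenize(text)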
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_nlp_rake, _ = Algorithm.objects.using(database)\
            .get_or_create(name="nlp_rake")

        queryset = Text.objects.using(database).filter(**filters)

        print "Removing previous results"
        query = """
            DELETE FROM keyword WHERE algorithm_id = %s
        """
        cursor = connections[database].cursor()
        cursor.execute(query, [algorithm_nlp_rake.id])

        queryset_iter = queryset_iterator(queryset, chunksize=1000)

        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())

        results = []
        pool = Pool()
        bulk_create_process = None
        for text in queryset_iter:
            keywords = work(text)
            for keyword, score in keywords:
                results.append(
                    Keyword(
                        algorithm=algorithm_nlp_rake,
                        text=text,
                        keyword=keyword,
                        score=score,
                    ))
            if len(results) > 250:
                Keyword.objects.using(database).bulk_create(results)
                results = []
            self.pbar_increment()
        self.pbar_destroy()

        if len(results) > 0:
            print "Saving last results..."
            Keyword.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def lda(self, database, table="Comment", date_to=None, date_from=None,
        num_topics=10, alpha=0.5, beta=0.5, iteration_count=25,
        smart_init=False, phrase=False):
    model = self.str_to_class(table)
    items = model.objects.using(database).exclude(text__isnull=True)\
        .filter(date__gte=date_from, date__lte=date_to)\
        .only('text')
    if items.count() == 0:
        return []

    contents = [i.text.text for i in items]

    all_docs = []
    if phrase:
        for i in queryset_iterator(items, chunksize=100):
            rake = Rake('stopwords.txt')
            keywords = rake.run(i.text.text)
            all_docs.append([k[0] for k in keywords if " " in k[0]])
    else:
        for i in queryset_iterator(items, chunksize=100):
            tokenized = nltk.word_tokenize(
                i.text.text.encode("ascii", "ignore"))
            all_docs.append(tokenized)

    stopwords = Stopwords('stopwords.txt')
    voca = Vocabulary(stopwords, excluds_stopwords=True)
    docs = [voca.doc_to_ids(doc) for doc in all_docs]

    lda = LDA(num_topics, alpha, beta, docs, voca.size(), smart_init)
    topics = lda_learning(lda, iteration_count, voca)
    return topics
def check_items(self):
    queryset = Newsitem.objects.filter(Q(lead="") |
                                       Q(text__isnull=True) |
                                       Q(title="") |
                                       Q(date__isnull=True) |
                                       Q(idauthor=""))
    for newsitem in queryset_iterator(queryset, chunksize=100):
        self.stdout.write("Processing URL " + newsitem.url)
        soup = BeautifulSoup(requests.get(newsitem.url).text, "html.parser")
        self.update_news_info(newsitem, soup)
        newsitem.save()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    if len(self.selected_dbs) > 1:
        self.stdout.write(self.style.ERROR(
            'You must choose a specific DB for this command'))
        return
    database = self.selected_dbs[0]

    current_text_id = 560000
    for model in ['Newsitem']:
        queryset = globals()[model].objects.using(database).filter(
            text__isnull=True)
        if queryset.count() == 0:
            continue

        text_list = []
        item_list = []

        print "Processing " + model
        self.pbar_setup(maxval=queryset.count())

        tokenizer = RegexpTokenizer(r'\w+')
        for item in queryset_iterator(queryset, chunksize=1000):
            if item.content:
                current_text = Text(
                    id=current_text_id,
                    text=item.content,
                    wordcount=len(tokenizer.tokenize(item.content)))
                text_list.append(current_text)
                item.text = current_text
                item_list.append(item)
                current_text_id += 1
            self.pbar_increment()

            if len(text_list) >= 2000:
                Text.objects.using(database).bulk_create(text_list)
                with transaction.atomic(database):
                    for (item, text) in itertools.izip(item_list, text_list):
                        item.text = text
                        item.save()
                item_list = []
                text_list = []
        self.pbar_destroy()

        print "Saving final results..."
        Text.objects.using(database).bulk_create(text_list)
        with transaction.atomic(database):
            for (item, text) in itertools.izip(item_list, text_list):
                item.text = text
                item.save()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    images_re = re.compile("!\[(.+)\]\((.+)\)\n\n.+Photograph: .+\n\n")
    video_re = re.compile(u' – video[ \w\*]*')

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and calculating..."
        queryset = Text.objects.using(database).filter(newsitem__isnull=False)

        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=1000):
            new_text = ""

            subn = images_re.subn("", text.text)
            qty_images = subn[1]

            qty_videos = 0
            for line in subn[0].split("\n"):
                if line.lower().strip() == "read more":
                    continue
                elif line.lower().strip() == "facebook twitter pinterest":
                    continue
                elif video_re.search(line):
                    qty_videos += 1
                else:
                    new_text += line + "\n"

            text.text = new_text
            text.save()

            if qty_images > 0 or qty_videos > 0:
                query = """
                    UPDATE newsitem
                    SET qty_images = %s, qty_videos = %s
                    WHERE text_id = %s
                """
                cursor.execute(query, [qty_images, qty_videos, text.id])

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        newsitem.cat1 = random.randint(1, 5)
        newsitem.save()
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    for database in self.selected_dbs:
        print "Processing database " + database
        pool = Pool()

        #print "Wordcounting newsitems"
        #self.wordcount = {}
        #
        #queryset = Newsitem.objects.using(database).select_related('text')\
        #    .filter(text__isnull=False)
        #queryset_iter = queryset_iterator(queryset, chunksize=25)
        #
        #self.pbar_setup(maxval=queryset.count())
        #for wordcount in pool.imap_unordered(count_words, queryset_iter):
        #    self.merge_wordcount(wordcount)
        #    self.pbar_increment()
        #self.pbar_destroy()
        #
        #output = codecs.open('wordcount_newsitems.csv', 'w', 'utf-8')
        #output.write("word,count\n")
        #
        #for word, count in self.wordcount.items():
        #    output.write("{},{}\n".format(word, count))

        print "Wordcounting comments"
        self.wordcount = {}

        queryset = Comment.objects.using(database).select_related('text')\
            .filter(text__isnull=False)
        queryset_iter = queryset_iterator(queryset, chunksize=100)

        self.pbar_setup(maxval=queryset.count())
        for wordcount in pool.imap_unordered(count_words, queryset_iter):
            self.merge_wordcount(wordcount)
            self.pbar_increment()
        self.pbar_destroy()

        output = codecs.open('wordcount_comments.csv', 'w', 'utf-8')
        output.write("word,count\n")
        for word, count in self.wordcount.items():
            output.write(u"{},{}\n".format(word, count))

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
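# `count_words` is the multiprocessing worker the wordcount command maps over
# comments, and `merge_wordcount` (a method on the Command above) folds each
# per-item result into self.wordcount. Neither is defined in this file; this is
# a minimal sketch of the assumed behaviour, not the project's confirmed code.
import re
from collections import Counter

def count_words(item):
    # per-item worker: token -> frequency for one comment's text
    tokens = re.findall(r"\w+", item.text.text.lower())
    return Counter(tokens)

def merge_wordcount(total, wordcount):
    # fold one worker result into the running total dict
    for word, count in wordcount.items():
        total[word] = total.get(word, 0) + count
    return total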
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    for newsitem in queryset_iterator(queryset, chunksize=10):
        shortId = newsitem.url.split('/')[-1]
        current_page = 1
        while True:
            resp = requests.get(
                "https://api.nextgen.guardianapps.co.uk/discussion/p/{0}.json"
                .format(shortId), {
                    'page': current_page,
                    'orderBy': 'oldest',
                    'pageSize': 100,
                    'displayThreaded': 'true',
                    'maxResponses': 1000000
                })
            if resp.status_code == 404:
                break

            try:
                dejson = resp.json()
                comment_soup = BeautifulSoup(dejson['commentsHtml'],
                                             "html.parser")
            except MemoryError:
                print "MemoryError when dealing with URL " + newsitem.url
                break  # skip the remaining pages of this newsitem

            with transaction.atomic():
                self.process_comment_ul(comment_soup.ul, newsitem, parent=None)

            current_page += 1
            if current_page > dejson['lastPage']:
                break
        self.pbar_increment()
    self.pbar_destroy()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_tf_naive_bayes, created = Algorithm.objects.using(database)\
            .get_or_create(name="tf_naive_bayes")

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm WHERE name = 'tf_naive_bayes'
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        print "Querying database"
        queryset = Text.objects.using(database).filter(**filters)

        results = []
        self.pbar_setup(maxval=queryset.count())
        queryset_iter = queryset_iterator(queryset, chunksize=2000)

        print "Calculating..."
        bulk_create_process = None
        pool = Pool()
        for text, result in pool.imap_unordered(estimate, queryset_iter):
            results.append(
                Result(algorithm=algorithm_tf_naive_bayes,
                       text=text,
                       value=str(result[0])))
            self.pbar_increment()

            if len(results) >= 10000:
                if bulk_create_process and bulk_create_process.is_alive():
                    bulk_create_process.join()
                bulk_create_process = Process(target=bulk_create,
                                              kwargs={
                                                  'database': database,
                                                  'results': copy.copy(results)
                                              })
                bulk_create_process.start()
                results = []
        self.pbar_destroy()

        print "Saving results"
        Result.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
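# `estimate` and `bulk_create` are module-level helpers used with
# multiprocessing above (Pool/Process targets must be picklable, hence module
# level). They are not defined in this file; the sketch below shows only the
# assumed contracts: `estimate` returns the Text together with the classifier's
# prediction, and `bulk_create` persists a batch on the given database from a
# child process. The classifier object `nb` (loaded elsewhere) is an assumption.
def estimate(text):
    # returns (text, prediction) so the parent process can build Result rows
    features = utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
    return text, nb.predict([features])

def bulk_create(database, results):
    # runs in a child process so the main loop keeps iterating while rows save
    Result.objects.using(database).bulk_create(results)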
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    bulk_create_process = None
    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_anew_valence, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_valence")
        algorithm_anew_arousal, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_arousal")
        algorithm_anew_dominance, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_dominance")

        queryset = Text.objects.using(database).filter(**filters)

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm
                WHERE name IN ('anew_valence', 'anew_arousal', 'anew_dominance')
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        results = []
        queryset_iter = queryset_iterator(queryset, chunksize=10000)

        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())
        pool = Pool()
        for text, work_results in pool.imap_unordered(work, queryset_iter):
            if work_results is not None:
                results.append(
                    Result(algorithm=algorithm_anew_arousal,
                           text=text,
                           value=work_results['arousal']))
                results.append(
                    Result(algorithm=algorithm_anew_valence,
                           text=text,
                           value=work_results['valence']))
                results.append(
                    Result(algorithm=algorithm_anew_dominance,
                           text=text,
                           value=work_results['dominance']))

                if len(results) > 10000:
                    if bulk_create_process and bulk_create_process.is_alive():
                        bulk_create_process.join()
                    bulk_create_process = Process(target=bulk_create,
                                                  kwargs={
                                                      'database': database,
                                                      'results': copy.copy(results)
                                                  })
                    bulk_create_process.start()
                    results = []
            self.pbar_increment()
        self.pbar_destroy()

        print "Saving last results..."
        Result.objects.using(database).bulk_create(results)
        results = []

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    self.stdout.write('Command started')
    super(Command, self).handle(*args, **options)

    self.stdout.write("Connecting manually to target database")
    targetdb_conn = MySQLdb.connect(
        host=settings.GENERAL_DATA_DB_DETAILS['HOST'],
        user=settings.GENERAL_DATA_DB_DETAILS['USER'],
        passwd=settings.GENERAL_DATA_DB_DETAILS['PASSWORD'],
        db=settings.GENERAL_DATA_DB_DETAILS['NAME'],
        port=settings.GENERAL_DATA_DB_DETAILS['PORT'],
        charset='utf8',
        use_unicode=True
    )
    targetdb_cursor = targetdb_conn.cursor()

    select_query = """
        SELECT * FROM author_analysis WHERE newssite = %s
    """

    for database in self.selected_dbs:
        self.stdout.write("Processing database " + database)

        self.authors = {}
        queryset = Author.objects.using(database).all()
        if queryset.count() > 0:
            self.stdout.write("Reading authors")
            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for author in queryset_iter:
                self.authors[author.name] = author
                self.pbar_increment()
            self.pbar_destroy()

        self.stdout.write("Moving data...")
        targetdb_cursor.execute(select_query, [database])

        self.pbar_setup(maxval=targetdb_cursor.rowcount)
        results = []
        for row in dictfetch(targetdb_cursor):
            if row['author'] in self.authors:
                le_author = self.authors[row['author']]
            else:
                le_author = Author.objects.using(database).create(
                    name=row['author'])
                self.authors[row['author']] = le_author

            results.append(AuthorSummary(
                author=le_author,
                avg_nr_words=row['avg_nr_words'],
                avg_wordlen=row['avg_wordlen'],
                avg_words_gt6=row['avg_words_gt6'],
                avg_personal=row['avg_personal'],
                avg_collective=row['avg_collective'],
                indegree=row['indegree'],
                indegree_centrality=row['indegree_centrality'],
                outdegree=row['outdegree'],
                outdegree_centrality=row['outdegree_centrality'],
                degree=row['degree'],
                degree_centrality=row['degree_centrality'],
                avg_shared=row['avg_shared'],
                pagerank=row['pagerank'],
                pagerank_weighted=row['pagerank_weighted'],
                nr_posts=row['nr_posts'],
                hub_score=row['hub_score'],
                authority_score=row['authority_score'],
                betweeness_centrality=row['betweeness_centrality'],
                closeness_centrality=row['closeness_centrality'],
                clustering_coef=row['clustering_coef'],
                eccentricity=row['eccentricity'],
                constraint=row['constraint'],
                polarity_arousal=row['polarity_arousal'],
                polarity_valence=row['polarity_valence'],
            ))

            if len(results) >= 1000:
                AuthorSummary.objects.using(database).bulk_create(results)
                results = []
            self.pbar_increment()
        self.pbar_destroy()

        if len(results) > 0:
            AuthorSummary.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
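# `dictfetch` wraps a raw DB-API cursor so rows can be read by column name
# (row['author'], row['nid'], ...) instead of by index. It is a project helper
# not shown in this file; below is a minimal sketch of the assumed
# implementation, based on the standard "fetch rows as dicts" recipe, written
# as a generator so large result sets stream row by row.
def dictfetch(cursor, arraysize=2000):
    columns = [col[0] for col in cursor.description]
    while True:
        rows = cursor.fetchmany(arraysize)
        if not rows:
            break
        for row in rows:
            yield dict(zip(columns, row))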
def handle(self, *args, **options):
    k = 5

    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Creating training and test data..."
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))
    # TFIDF_svm is the input matrix of SVM

    # Reads the train_len from command line
    #train_len=int(sys.argv[1])
    train_len = 200

    # Index of train samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Index of train samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # We have K number of positive samples and also K number of negative samples
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    # Train: train matrix
    # train_label: labels of train data
    # The other samples are test samples.
    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    print "Fitting..."
    clf = svm.SVC(probability=True)
    # Train the model
    clf.fit(train, train_label)
    #print "Score: " + clf.score(train, train_label)

    print "Generating probabilities"
    pred_probas = clf.predict_proba(test)[:, 1]
    fpr, tpr, _ = roc_curve(test_label, pred_probas)
    roc_auc = auc(fpr, tpr)

    print "Plotting..."
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    print "Saving!"
    plt.savefig('out.png')

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    k = 5  # assumed neighbour count, matching the sibling kNN/SVM commands

    TFIDF_list = []
    label = []
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(self.tokenize(newsitem.text.text))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    self.train()

    print "Estimating..."
    self.pbar_setup(maxval=len(TFIDF_list))
    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    while counter1 < len(TFIDF_list):
        distance_list = []
        counter2 = 0
        while counter2 < len(TFIDF_list):
            if counter1 != counter2:
                distance_list.append(
                    utils.TFIDF_distance(TFIDF_list[counter1],
                                         TFIDF_list[counter2]))
            counter2 += 1

        nearest_list = sorted(range(len(distance_list)),
                              key=lambda i: distance_list[i])[:k]

        repeat_dic = {}
        for i in nearest_list:
            if repeat_dic.has_key(label[i]):
                repeat_dic[label[i]] += 1
            else:
                repeat_dic[label[i]] = 1
        estimate_label = max(repeat_dic, key=repeat_dic.get)

        # predicting positive on a negative sample is a false positive,
        # and the reverse a false negative
        if estimate_label == 1 and label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and label[counter1] == 0:
            FP += 1
        elif estimate_label == 0 and label[counter1] == 0:
            TN += 1
        else:
            FN += 1

        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    data = [
        ('algo_knn_tp', TP),
        ('algo_knn_fn', FN),
        ('algo_knn_fp', FP),
        ('algo_knn_tn', TN),
    ]
    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

    print "Saving algorithm results"
    for item in data:
        algorithm_name = item[0]
        value = item[1]
        algorithm, create = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, created = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_knn_uniform_estimative, create = Algorithm.objects.get_or_create(
        name="algo_knn_uniform_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        data = utils.TFIDF(utils.tokenize(newsitem.text.text))
        distance_list = []
        for i in range(len(TFIDF_list)):
            distance_list.append(utils.TFIDF_distance(data, TFIDF_list[i]))

        nearest_list = sorted(range(len(distance_list)),
                              key=lambda i: distance_list[i])[:k]

        repeat_dic = {}
        for i in nearest_list:
            if distance_list[i] != 0:
                if repeat_dic.has_key(label[i]):
                    repeat_dic[label[i]] += 1
                else:
                    repeat_dic[label[i]] = 1
        estimate = max(repeat_dic, key=repeat_dic.get)

        Result.objects.create(algorithm=algo_knn_uniform_estimative,
                              text=newsitem.text,
                              value=str(estimate))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def get_iterator(self):
    return itertools.chain.from_iterable([
        queryset_iterator(queryset, chunksize=self.chunksize)
        for queryset in self.querysets
    ])
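# A small usage sketch of the method above (object and model names here are
# hypothetical, for illustration only): the wrapper holds several querysets,
# possibly from different databases, and get_iterator exposes them as one flat,
# chunked stream.
#
#wrapper.querysets = [Comment.objects.using(db).all()
#                     for db in self.selected_dbs]
#wrapper.chunksize = 500
#for comment in wrapper.get_iterator():
#    process(comment)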
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    citations_re = re.compile("\".*\"")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        algorithm_list = ['afinn', 'TG_KNN3_TFIDF', 'TG_DT_TFIDF', 'TG_NB_TFIDF']
        queryset = Algorithm.objects.using(database).filter(
            name__in=algorithm_list)

        comments_score_query = """
            SELECT newsitem.id nid, comment.id, result.value
            FROM newsitem, comment, text, result
            WHERE comment.NewsItemID = newsitem.id
              AND comment.text_id = text.id
              AND text.id = result.text_id
              AND result.algorithm_id = %s
            ORDER BY nid, value
        """
        insert_summary_query = """
            INSERT INTO `newsitem_pos_neg_comments`
                (`newsitem_id`, `algorithm_id`, `pos_comments`,
                 `neg_comments`, `neutral_comments`)
            VALUES (%s,%s,%s,%s,%s)
        """
        insert_detail_query = """
            INSERT INTO `pos_neg_comments`
                (`algorithm_id`, `newsitem_id`, `comment_id`, `sentiment`)
            VALUES (%s,%s,%s,%s)
        """

        cursor = connections[database].cursor()
        for algorithm in queryset_iterator(queryset, chunksize=1000):
            print "Processing algorithm {}".format(algorithm.name)
            cursor.execute(comments_score_query, [algorithm.id])

            current_newsitem = 0
            type_hash = Counter({POSITIVE_COMMENT: 0,
                                 NEGATIVE_COMMENT: 0,
                                 NEUTRAL_COMMENT: 0})

            with transaction.atomic():
                self.pbar_setup(maxval=cursor.rowcount)
                insert_cursor = connections[database].cursor()
                detail_rows_to_insert = []
                summary_rows_to_insert = []

                for row in dictfetch(cursor):
                    if row['nid'] != current_newsitem:
                        if current_newsitem > 0:
                            summary_rows_to_insert.append([
                                current_newsitem,
                                algorithm.id,
                                type_hash[POSITIVE_COMMENT],
                                type_hash[NEGATIVE_COMMENT],
                                type_hash[NEUTRAL_COMMENT],
                            ])
                        current_newsitem = row['nid']
                        type_hash = Counter({POSITIVE_COMMENT: 0,
                                             NEGATIVE_COMMENT: 0,
                                             NEUTRAL_COMMENT: 0})

                    # count every row, including the first one of each newsitem
                    comment_type = self.check_result(algorithm, row['value'])
                    type_hash[comment_type] += 1
                    detail_rows_to_insert.append([
                        algorithm.id, row['nid'], row['id'], comment_type
                    ])

                    if len(detail_rows_to_insert) >= 10000:
                        insert_cursor.executemany(insert_detail_query,
                                                  detail_rows_to_insert)
                        detail_rows_to_insert = []

                    self.pbar_increment()

                if current_newsitem > 0:
                    summary_rows_to_insert.append([
                        current_newsitem,
                        algorithm.id,
                        type_hash[POSITIVE_COMMENT],
                        type_hash[NEGATIVE_COMMENT],
                        type_hash[NEUTRAL_COMMENT],
                    ])
                if detail_rows_to_insert:
                    insert_cursor.executemany(insert_detail_query,
                                              detail_rows_to_insert)
                insert_cursor.executemany(insert_summary_query,
                                          summary_rows_to_insert)
                self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    k = 5

    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Training..."
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))
    # TFIDF_svm is the input matrix of SVM

    # Reads the train_len from command line
    #train_len=int(sys.argv[1])
    train_len = 200

    # Index of train samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Index of train samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # We have K number of positive samples and also K number of negative samples
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    # Train: train matrix
    # train_label: labels of train data
    # The other samples are test samples.
    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    clf = svm.SVC()
    # Train the model
    clf.fit(train, train_label)

    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    print "Estimating..."
    self.pbar_setup(maxval=len(test))
    for i in test:
        estimate_label = clf.predict([i])[0]
        # compare against the test-set labels; predicting positive on a
        # negative sample is a false positive, and the reverse a false negative
        if estimate_label == 1 and test_label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and test_label[counter1] == 0:
            FP += 1
        elif estimate_label == 0 and test_label[counter1] == 0:
            TN += 1
        else:
            FN += 1
        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

    data = [
        ('algo_svm_tp', TP),
        ('algo_svm_fn', FN),
        ('algo_svm_fp', FP),
        ('algo_svm_tn', TN),
        ('algo_svm_score', clf.score(train, train_label)),
    ]

    print "Saving algorithm results"
    for item in data:
        algorithm_name = item[0]
        value = item[1]
        algorithm, create = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, create = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_svm_estimative, create = Algorithm.objects.get_or_create(
        name="algo_svm_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        estimate = clf.predict([
            utils.TFIDF_to_list(
                utils.TFIDF(utils.tokenize(newsitem.text.text)))
        ])
        Result.objects.create(algorithm=algo_svm_estimative,
                              text=newsitem.text,
                              value=str(estimate[0]))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    if len(self.selected_dbs) != 1:
        self.stdout.write(self.style.ERROR(
            'You need to select exactly one database for this command'))
        return
    selected_db = self.selected_dbs[0]

    nodes = Element('nodes')
    edges = Element('edges')

    self.stdout.write("Processing NewsItems")
    queryset = Newsitem.objects.using(selected_db).only('id').all()
    self.pbar_setup(maxval=queryset.count())
    for newsitem in queryset_iterator(queryset, chunksize=10000):
        nodes.append(Element('node', attrib={
            'id': "N{}".format(newsitem.id),
            'label': "Newsitem {}".format(newsitem.id),
        }))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write("Processing Comments")
    queryset = Comment.objects.using(selected_db)\
        .only('id', 'parent_id', 'newsitem_id')\
        .all()
    edge_id = 0
    self.pbar_setup(maxval=queryset.count())
    for comment in queryset_iterator(queryset, chunksize=10000):
        comment_id = "C{}".format(comment.id)
        nodes.append(Element('node', attrib={
            'id': comment_id,
            'label': "Comment {}".format(comment.id),
        }))

        # a reply points at its parent comment, a top-level comment at its newsitem
        if comment.parent_id:
            target_id = "C{}".format(comment.parent_id)
        else:
            target_id = "N{}".format(comment.newsitem_id)

        edges.append(Element('edge', attrib={
            'id': str(edge_id),
            'source': comment_id,
            'target': target_id,
            'type': 'directed'
        }))
        edge_id += 1
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write("Finishing...")
    graph = Element('graph', attrib={
        'mode': 'dynamic',
        'defaultedgetype': 'directed',
    })
    graph.append(nodes)
    graph.append(edges)

    meta = Element('meta', attrib={
        'lastmodifieddate': datetime.date.today().isoformat()
    })
    creator = Element('creator')
    creator.text = "Iris Steenhout"
    meta.append(creator)
    description = Element('description')
    description.text = "Newsitems and their comments."
    meta.append(description)

    gexf = Element('gexf', attrib={
        'xmlns': 'http://www.gexf.net/1.2draft',
        'version': '1.2',
    })
    gexf.append(meta)
    gexf.append(graph)

    self.stdout.write("Writing...")
    options['output'].write(tostring(gexf, encoding="UTF-8"))
    options['output'].close()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    lem = WordNetLemmatizer()

    stopwords = {}
    with open('stopwords.txt', 'rU') as f:
        for line in f:
            stopwords[line.strip()] = 1

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_tokenize, created = Algorithm.objects.using(
            database).get_or_create(name="vlad_tokenize")
        word_cache = WordCache()

        print "Excluding previous results"
        #Exclude all tokens and create them again
        Result.objects.using(database).filter(
            algorithm=algorithm_tokenize).delete()
        Word.objects.using(database).all().delete()

        print "Reading comments"
        non_tokenized_comments = Comment.objects.using(database).filter(
            text__isnull=False)
        total_comments = non_tokenized_comments.count()
        if total_comments == 0:
            print "No items to tokenize!"
            return

        comments = queryset_iterator(non_tokenized_comments.all(),
                                     chunksize=100)
        text_word = []

        print "Starting process!"
        self.pbar_setup(maxval=total_comments)
        for comment in comments:
            tokens = nltk.word_tokenize(self.clean_text(comment.text.text))
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word_text, tag in tagged_text:
                if tag in ['NN', 'NNS']:
                    word_id = word_cache.get(word_text)
                    if not word_id:
                        word_attributes = {
                            'word': word_text,
                            'tag': tag,
                            'noun': lem.lemmatize(word_text)
                        }
                        word_id = word_cache.save(word_attributes)
                    text_word.append((word_id, comment.text.id))

            self.pbar_increment()
        self.pbar_destroy()

        print "Bulk creating words"
        Word.objects.using(database).bulk_create(
            [Word(**attrib) for attrib in word_cache.get_as_list()])

        print "Opening cursor"
        cursor = connections[database].cursor()
        query = 'INSERT IGNORE INTO word_texts (word_id, text_id) VALUES (%s, %s)'

        print "Inserting rows for relation word-text..."
        self.pbar_setup(maxval=len(text_word))
        while len(text_word) > 0:
            chunk = []
            while len(chunk) < 10000:
                try:
                    chunk.append(text_word.pop())
                    self.pbar_increment()
                except IndexError:
                    break
            cursor.executemany(query, chunk)
            transaction.commit(database)
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    #Create temporary directory to write the corpus LDA-C files
    temp_dir_path = tempfile.mkdtemp()
    corpus_path = temp_dir_path + "/corpus.lda-c"
    lda_num_topics = 50

    for database in self.selected_dbs:
        print "Processing database " + database

        #Building dictionary
        print "Building dictionary"
        dictionary = Dictionary()
        queryset = Comment.objects.using(database).exclude(text__isnull=True)
        self.pbar_setup(maxval=queryset.count())
        for comment in queryset_iterator(queryset, chunksize=50):
            dictionary.add_documents(
                [[word.word for word in comment.text.word_set.all()]])
            self.pbar_increment()
        self.pbar_destroy()
        dictionary.filter_extremes(keep_n=10000)
        dictionary.compactify()

        #Serialize corpus
        print "Serializing corpus"
        corpus = Corpus(
            queryset_iterator(Comment.objects.using(database).all(),
                              chunksize=50), dictionary)
        BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

        #Train
        print "Training..."
        bleicorpus = BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(bleicorpus,
                                     num_topics=lda_num_topics,
                                     id2word=dictionary)

        #Saving
        print "Saving results to DB"
        lda_db_obj, created = Algorithm.objects.using(
            database).get_or_create(name='LDA')

        #Removing previous results
        lda_db_obj.result_set.all().delete()

        #Looping through results and saving to DB
        i = 0
        for topic in lda.show_topics(num_topics=lda_num_topics):
            Result.objects.using(database).create(sequence=i,
                                                  value=str(topic),
                                                  algorithm=lda_db_obj)
            i += 1

    #Remove temporary directory
    #Check first if it's not the current working directory, as removing it
    # would be a disaster! ;)
    if os.getcwd() != temp_dir_path:
        #Just remove it if it's a temp dir
        shutil.rmtree(temp_dir_path)
    else:
        #If it's the current working directory, just remove the unneeded files
        map(os.remove, glob.glob('corpus.lda-c*'))

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
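# `Corpus` above is a small streaming adapter between the Django queryset and
# gensim's BleiCorpus.serialize; it is defined elsewhere in the project. Below
# is a minimal sketch of the assumed shape (the name and the word_set-based
# tokenization are assumptions inferred from how it is constructed above):
# iterating it yields one bag-of-words vector per comment without loading the
# whole collection into memory.
class Corpus(object):
    def __init__(self, comments, dictionary):
        self.comments = comments
        self.dictionary = dictionary

    def __iter__(self):
        for comment in self.comments:
            words = [word.word for word in comment.text.word_set.all()]
            yield self.dictionary.doc2bow(words)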
def handle(self, *args, **options):
    self.stdout.write('Command started')
    super(Command, self).handle(*args, **options)

    #self.edges = []
    #
    #self.stdout.write("Connecting manually to target database")
    #targetdb_conn = MySQLdb.connect(
    #    host = settings.GENERAL_DATA_DB_DETAILS['HOST'],
    #    user = settings.GENERAL_DATA_DB_DETAILS['USER'],
    #    passwd= settings.GENERAL_DATA_DB_DETAILS['PASSWORD'],
    #    db = settings.GENERAL_DATA_DB_DETAILS['NAME'],
    #    port = settings.GENERAL_DATA_DB_DETAILS['PORT'],
    #    charset = 'utf8',
    #    use_unicode = True
    #)
    #targetdb_cursor = targetdb_conn.cursor()
    #
    #select_query = """
    #    SELECT * FROM author_analysis WHERE newssite = %s
    #"""

    for database in self.selected_dbs:
        self.stdout.write("Processing database " + database)

        self.authors = {}
        queryset = Author.objects.using(database).all()
        if queryset.count() > 0:
            self.stdout.write("Reading authors")
            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for author in queryset_iter:
                self.authors[author.name] = author
                self.pbar_increment()
            self.pbar_destroy()

        for model in [Comment, Newsitem]:
            if model == Comment:
                model_name = "Comment"
                filters = {
                    'authorshortname__isnull': False,
                    'author__isnull': True
                }
            else:
                model_name = "Newsitem"
                filters = {
                    'idauthor__isnull': False,
                    'author__isnull': True
                }
            self.stdout.write("Processing model " + model_name)

            queryset = model.objects.using(database)\
                .filter(**filters)

            self.stdout.write('Linking Authors with ' + model_name)
            if queryset.count() == 0:
                self.stdout.write('No authors to link')
                continue

            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for item in queryset_iter:
                if model == Comment:
                    if item.authorshortname in self.authors:
                        item.author = self.authors[item.authorshortname]
                    else:
                        le_author = Author.objects.using(database).create(
                            name=item.authorshortname,
                            long_name=item.authorid)
                        item.author = le_author
                        self.authors[item.authorshortname] = le_author
                else:
                    if item.idauthor in self.authors:
                        item.author = self.authors[item.idauthor]
                    else:
                        le_author = Author.objects.using(database).create(
                            name=item.idauthor)
                        item.author = le_author
                        self.authors[item.idauthor] = le_author
                item.save()
                self.pbar_increment()
            self.pbar_destroy()

            bulk_update(queryset, batch_size=1000, using=database,
                        update_fields=['author'])

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))