Example #1
from collections import deque


class ProcessingScheduler:
    def __init__(self):
        self.processing_queue = deque()
        self.actively_processing = {}
        self.mongo = MongoDatabase()

    def add_video(self, request: dict = None):
        '''Add a video to the processing queue.

        @request: dictionary containing the video filename and specified options.
        '''
        # avoid sharing a mutable default argument between calls
        request = request if request is not None else {}

        self.processing_queue.append(request)
        if not self.actively_processing:
            self.actively_processing = self.processing_queue.popleft()
            self.process()

    def get_status(self):
        '''Get the status of each video in the processing pipeline.'''
        status = {el['filename']: 'Waiting...' for el in self.processing_queue}
        if self.actively_processing:
            status[self.actively_processing['filename']] = 'Processing'

        return status

    def process(self):
        '''Process a video.'''
        print(f'Processing video {self.actively_processing["filename"]}...')
        print('Add processing code here.......')
        self.mongo.add_processed_video(self.actively_processing)
        self.actively_processing = {}
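
# A minimal usage sketch for the scheduler above (not part of the original project).
# A stub stands in for MongoDatabase so the example runs without a real database; in
# the original project process() presumably runs for a while, so later videos would
# actually wait in the queue rather than being processed immediately.
if __name__ == '__main__':

    class MongoDatabase:  # stub replacing the real database wrapper
        def add_processed_video(self, video):
            print(f'Stored {video["filename"]} in the database.')

    scheduler = ProcessingScheduler()
    scheduler.add_video({'filename': 'clip1.webm'})  # nothing active, so it is processed right away
    scheduler.add_video({'filename': 'clip2.webm'})  # process() already returned, so this runs immediately too
    print(scheduler.get_status())                    # {} - nothing left waiting or processing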
Example #2
    def __init__(self):
        super().__init__()

        self.ui = mainwindow.Ui_Frec()
        self.ui.setupUi(self)
        self.db = MongoDatabase()

        self.init_frec()
Example #3
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf8 encoding
        reload(sys)
        sys.setdefaultencoding('utf8')
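        # note: reload(sys) / sys.setdefaultencoding() only exist in Python 2; on
        # Python 3, str is unicode by default and this workaround is not needed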
Example #4
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # location to store plots
        self.plot_save_folder = os.path.join('files', 'plots')

        # location to store tables to
        self.table_save_folder = os.path.join('files', 'tables')
Example #5
    def __init__(self):
        super().__init__()

        self.ui = mainwindow.Ui_Frec()
        self.ui.setupUi(self)
        self.db = MongoDatabase()

        self.init_frec()
        self.is_local_save = False

        self.dirs = config.Dirs()

        self.setWindowIcon(QIcon(self.dirs.appicon))
Example #6
	def confirmDB(self):
		print 'In confirmDB'
		# global path, thumbPath, fullImgPath, self.running
		#print '\n\n' + str(path) + '\n\n'
		self.thumbPath=self.path + 'imgThumb'
		self.fullImgPath=self.path + 'imgFull'
		if not os.path.exists(self.path): getPath(self)
		if not os.path.exists(self.thumbPath): os.makedirs(self.thumbPath)
		if not os.path.exists(self.fullImgPath): os.makedirs(self.fullImgPath)

		### Initialize slide db ###
	#	print db.collection_names()
		if u'slideColl' not in MongoDatabase.db.collection_names():
			print '1'
			MongoDatabase.initSlides(self)
		else:
			print '2'
			MongoDatabase.updateSlides(self)
Example #7
    def __init__(self):

        self.components = []
        self.database = MongoDatabase("localhost")

        self.load_components()
Example #8
# Specify the port number.
PORT = 1496

# Directory to save videos to.
ROOT_DIR = os.getcwd()
SAVE_DIR = f'{os.getcwd()}/videos'

# Create the server.
app = Flask(__name__)
api = Api(app)
valid_headers = ['Content-Type', 'Access-Control-Allow-Origin', '*']
cors = CORS(app, allow_headers=valid_headers)

# Connect to the database.
mongo = MongoDatabase()

# Video processing.
processor = Processor()

# Do a little server-side checking.
ALLOWED_EXTENSIONS = set(['webm', 'mp4', 'mp3', 'wav', 'jpeg', 'gif', 'png'])

# -------------------------------------------------------------
# Global functions.


def allowed_file(filename):
    '''Ensure we want to keep this file.'''
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
Example #9
from datetime import datetime
from database import MongoDatabase

db = MongoDatabase()
experiments = db.experiments

format = '%Y-%m-%d'

available_times = []

for a in range(1):

    dish = list(db.experiments.find({'dishNumber': str(a + 1)}))

    for b in range(1):

        start_date = list(dish)[b]['startDate']
        start_experiment = start_date

        end_date = list(dish)[b]['endDate']
        end_experiment = end_date

        start_experiment = datetime.strptime(start_experiment, format)
        end_experiment = datetime.strptime(end_experiment, format)

        start_experiment = int(datetime.timestamp(start_experiment))
        end_experiment = int(datetime.timestamp(end_experiment))

        print(start_experiment, type(start_experiment))
        print(end_experiment, type(end_experiment))
Example #10


"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # instantiate twitter object
    twitter = Twitter(key=API_KEY, secret=API_SECRET)

    # create connection
    twitter.connect_to_API()

    if get_sanders_tweets:
        """
			The Sanders dataset consists of 5,513 hand-classified tweets related to the topics Apple (@Apple), Google (#Google), Microsoft (#Microsoft), and Twitter (#Twitter). Tweets were
			classified as positive, neutral, negative, or irrelevant; the latter referring to non-English tweets, which we discarded. The Sanders dataset has been used for boosting Twitter
			sentiment classification using different sentiment dimensions, combining automatically and hand-labeled Twitter sentiment labels, and combining community detection and sentiment
			analysis. The dataset is available from http://www.sananalytics.com/lab/.

			Tweets are saved into the collection 'sanders_tweets_raw'
Example #11
class Frec(QMainWindow):
    def __init__(self):
        super().__init__()

        self.ui = mainwindow.Ui_Frec()
        self.ui.setupUi(self)
        self.db = MongoDatabase()

        self.init_frec()

    def init_frec(self):
        self.ui.btn_register.clicked.connect(self.save_new_member)
        self.ui.btn_delete.clicked.connect(self.delete_member)

        self.ui.btn_clear.clicked.connect(self.clear_form)
        self.ui.btn_export.clicked.connect(self.export_cvs)
        self.ui.btn_import.clicked.connect(self.import_cvs)

        self.ui.btn_connectDb.clicked.connect(self.connect_db)

        self.connect_db()
        self.show_member_at_tableWidget()

    def show_message(self, msg, timeout=3000):
        self.ui.statusBar.showMessage(msg, timeout)

    def connect_db(self):
        dbName = self.ui.lne_dbName.text()
        if self.db.connect(dbName):
            self.show_message("Database connection with '" + dbName +
                              "' is successful.")
            self.ui.lbl_dbConnection.setText("Database : Connected '" +
                                             dbName + "'")
            self.ui.lbl_dbConnection.setStyleSheet('color: green')
        else:
            self.ui.lbl_dbConnection.setText("Database : Disconnected")
            self.ui.lbl_dbConnection.setStyleSheet('color: red')
            self.show_message("Database connection couldn't be established!")
            return False

        return True

    # Save new member to database
    def save_new_member(self):

        new = {
            "firstname": self.ui.lne_firstName.text(),
            "surname": self.ui.lne_lastName.text(),
            "department": self.ui.comboBox_department.currentText(),
            "email": self.ui.lne_email.text(),
            "mobilencc": self.ui.lne_mobileCyp.text(),
            "mobileother": self.ui.lne_mobileOther.text(),
        }

        self.db.add_new_member(new['firstname'], new['surname'], new['email'],
                               new['department'], new['mobilencc'],
                               new['mobileother'])
        self.add_member_to_tableWidget(new['firstname'], new['surname'],
                                       new['email'], new['department'],
                                       new['mobilencc'], new['mobileother'])

    # show member on tableview
    def add_member_to_tableWidget(self, firstname, surname, email, department,
                                  mobilencc, mobileother):
        rowPoint = self.ui.tableWidget.rowCount()
        self.table.insertRow(rowPoint)
        self.table.setItem(rowPoint, 0, QTableWidgetItem(firstname))
        self.table.setItem(rowPoint, 1, QTableWidgetItem(surname))
        self.table.setItem(rowPoint, 2, QTableWidgetItem(email))
        self.table.setItem(rowPoint, 3, QTableWidgetItem(department))
        self.table.setItem(rowPoint, 4, QTableWidgetItem(mobilencc))
        self.table.setItem(rowPoint, 5, QTableWidgetItem(mobileother))

    def show_member_at_tableWidget(self):
        self.table = self.ui.tableWidget
        self.table.setColumnCount(6)
        self.table.setHorizontalHeaderLabels([
            "firstname", "surname", "email", "department", "mobilencc",
            "mobileother"
        ])
        query = {}
        result = self.db.query_result_multi("Member", query)
        for member in result:
            self.add_member_to_tableWidget(member["name"]["first"],
                                           member["name"]["last"],
                                           member["email"],
                                           member["department"],
                                           member["mobileNo"]["ncc"],
                                           member["mobileNo"]["other"])

    # Delete chosen member on tableView
    def delete_member(self):
        pass  # TODO

    # Clear inside the line edits in registration form
    def clear_form(self):
        self.ui.lne_email.setText("")
        self.ui.lne_firstName.setText("")
        self.ui.lne_lastName.setText("")
        self.ui.lne_mobileCyp.setText("")
        self.ui.lne_mobileOther.setText("")
        self.ui.comboBox_department.setCurrentIndex(0)

    # Export members in the database as csv
    def export_cvs(self):
        pass  # TODO

    # Import members from a csv file
    def import_cvs(self):
        pass  # TODO

    def create_desktop_entry(self):
        # TODO: Replace [-HOMEDIR-] inside ./data/freg.desktop and copy it to ~/.local/share/applications/
        pass
Example #12
class Preprocessing():
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf8 encoding
        reload(sys)
        sys.setdefaultencoding('utf8')

    def full_text_preprocessing(self, pdf_folder=os.path.join('files', 'pdf')):
        """
			preprocess full-text publications
			- convert pdf to plain text
			- correct for carriage returns
			- correct for end-of-line hyphenation
			- remove boilerplate
			- remove bibliography
			- remove acknowledgements

			Parameters
			----------
			pdf_folder : os.path
				location where PDF documents are stored
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read pdf files that need to be converted
        F = [x for x in read_directory(pdf_folder) if x[-4:] == '.pdf']

        # read documents from DB that have already been processed so we can skip them
        processed_documents = [
            '{}-{}-{}'.format(x['journal'], x['year'], x['title'])
            for x in self.db.read_collection(collection='publications_raw')
        ]

        # loop over each file and convert pdf to plain and save meta data to DB
        for i, f in enumerate(F):

            # extract meta data from folder structure and file name
            journal = f.split('/')[2]
            year = f.split('/')[3]
            title = f.split('/')[4].replace('-', ' ')[4:-4].strip()

            # console output
            print_doc_verbose(i, len(F), journal, year, title)

            # check if PDF has already been processed
            if '{}-{}-{}'.format(journal, year, title) in processed_documents:
                logging.info('PDF document already processed, skipping ...')
                continue

            # convert content of PDF to plain text
            content = pdf_to_plain(f)

            # check if content could be extracted
            if content is not None:

                # fix soft hyphen
                content = content.replace(u'\xad', "-")
                # fix em-dash
                content = content.replace(u'\u2014', "-")
                # fix en-dash
                content = content.replace(u'\u2013', "-")
                # minus sign
                content = content.replace(u'\u2212', "-")
                # fix hyphenation that occur just before a new line
                content = content.replace('-\n', '')
                # remove new lines/carriage returns
                content = content.replace('\n', ' ')

                # correct for ligatures
                content = content.replace(u'\ufb02', "fl")  # fl ligature
                content = content.replace(u'\ufb01', "fi")  # fi ligature
                content = content.replace(u'\ufb00', "ff")  # ff ligature
                content = content.replace(u'\ufb03', "ffi")  # ffi ligature
                content = content.replace(u'\ufb04', "ffl")  # ffl ligature
                """ 
					Remove boilerplate content:

					Especially journal publications have lots of boilerplate content on the titlepage. Removing of this is specific for each
					journal and you can use some regular expressions to identify and remove it.
				"""
                """
					Remove acknowledgemends and/or references
					This is a somewhat crude example
				"""
                if content.rfind("References") > 0:
                    content = content[:content.rfind("References")]
                """
				 	Remove acknowledgements
				"""
                if content.rfind("Acknowledgment") > 0:
                    content = content[:content.rfind("Acknowledgment")]

                # prepare dictionary to save into MongoDB
                doc = {
                    'journal': journal,
                    'title': title,
                    'year': year,
                    'content': content
                }

                # save to database
                self.db.insert_one_to_collection(doc=doc,
                                                 collection='publications_raw')

    def general_preprocessing(self, min_bigram_count=5):
        """
			General preprocessing of publications (used for abstracts and full-text)

			Parameters
			----------
			min_bigram_count : int (optional)
				minimum number of times a bigram must occur to be included in the list of bigrams; bigrams with a lower frequency are discarded.
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read document collection
        D = self.db.read_collection(collection='publications_raw')

        # setup spacy natural language processing object
        nlp = setup_spacy()

        # loop through the documents and correct content
        for i, d in enumerate(D):

            # check if tokens are already present, if so, skip
            if d.get('tokens') is None:

                # print to console
                print_doc_verbose(i, D.count(), d['journal'], d['year'],
                                  d['title'])

                # get content from document and convert to spacy object
                content = nlp(d['content'])

                # tokenize, lemmatize, remove punctuation and single-character words
                unigrams = word_tokenizer(content)

                # get entities
                entities = named_entity_recognition(content)

                # get bigrams
                bigrams = get_bigrams(" ".join(unigrams))
                bigrams = [['{} {}'.format(x[0], x[1])] * y
                           for x, y in Counter(bigrams).most_common()
                           if y >= min_bigram_count]
                bigrams = list(itertools.chain(*bigrams))
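                # e.g. if ('topic', 'model') occurs 7 times and min_bigram_count is 5, the flattened
                # list contains 'topic model' 7 times, so the bigram keeps its frequency weight in the
                # bag-of-words representation built from d['tokens'] later on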

                d['tokens'] = unigrams + bigrams + entities

                # save dictionary to database
                self.db.update_collection(collection='publications_raw', doc=d)

            else:
                logging.debug('Document already tokenized, skipping ...')
Example #13
 def __init__(self):
     self.processing_queue = deque()
     self.actively_processing = {}
     self.mongo = MongoDatabase()
Example #14
from helper_functions import *
from database import MongoDatabase


"""
	Script starts here
"""
if __name__ == "__main__":

	# create logging to console
	set_logger()

	logging.info('Start: {} '.format(__file__))

	# create database connection
	db = MongoDatabase()

	# location of target tweets
	location_tweets = os.path.join('files', 'target_tweets')

	# modes of research
	modes_of_research = ['interdisciplinary', 'multidisciplinary','transdisciplinary']

	# process tweets for each mode of research
	for mode in modes_of_research:

		logging.info('Processing mode of research: {}'.format(mode))

		# read tweets files
		F = read_directory(os.path.join(location_tweets, mode))
Example #15


"""
	Script starts here
"""

if __name__ == "__main__":

	# create logging to console
	set_logger()

	logging.info('Start: {} '.format(__file__))

	# create database connection
	db = MongoDatabase()

	# instantiate twitter object
	twitter = Twitter(key = API_KEY, secret = API_SECRET)

	# create connection
	twitter.connect_to_API()

	if get_sanders_tweets:

		"""
			The Sanders dataset consists of 5,513 hand-classified tweets related to the topics Apple (@Apple), Google (#Google), Microsoft (#Microsoft), and Twitter (#Twitter). Tweets were
			classified as positive, neutral, negative, or irrelevant; the latter referring to non-English tweets, which we discarded. The Sanders dataset has been used for boosting Twitter
			sentiment classification using different sentiment dimensions, combining automatically and hand-labeled Twitter sentiment labels, and combining community detection and sentiment
			analysis. The dataset is available from http://www.sananalytics.com/lab/.
Example #16
class Interpretation():
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # location to store plots
        self.plot_save_folder = os.path.join('files', 'plots')

        # location to store tables to
        self.table_save_folder = os.path.join('files', 'tables')

    def infer_document_topic_distribution(
        self,
        K=10,
        dir_prior='auto',
        random_state=42,
        num_pass=15,
        iteration=200,
        top_n_words=10,
        models_folder=os.path.join('files', 'models'),
        lda_files_folder=os.path.join('files', 'lda')):
        """
			Infer the document-topic distribution per publication. The LDA model shows us the word probabilities per topic, but we also want to know what
			topics we find within each document. Here we infer such a document-topic distribution and save it to the database so we can use it later
			to plot some interesting views of the corpus.

			Values for K, dir_prior, random_state, num_pass and iteration will become visible when plotting the coherence score. Use the model that
			achieved the highest coherence score.

			Parameters
			-----------
			k: int
				number of topics that resulted in the best decomposition of the underlying corpora
			dir_prior: string
				dirichlet priors 'auto', 'symmetric', 'asymmetric'
			random_state: int
				seed value for random initialization
			num_pass: int
				number of passes over the full corpus
			iteration: int
				max iterations for convergence
			top_n_words: int
				only print out the top N high probability words
			models_folder: os.path
				location of created LDA models
			lda_files_folder: os.path
				location of LDA corpus and dictionary
			save_folder: os.path
				location to store the tables

		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read dictionary and corpus
        dictionary, corpus = get_dic_corpus(lda_files_folder)

        # load LDA model according to parameters
        model = load_lda_model(
            os.path.join(models_folder, str(K), dir_prior, str(random_state),
                         str(num_pass), str(iteration)))

        # load docs
        D = self.db.read_collection(collection='publications_raw')

        # loop through all the documents to infer the document-topic distribution
        for i, d in enumerate(D):

            # check if tokens are present; in case some documents couldn't properly be tokenized during pre-processing phase
            if d.get('tokens') is not None:

                # print to console
                print_doc_verbose(i, D.count(), d['journal'], d['year'],
                                  d['title'])

                # create bag of words from tokens
                bow = model.id2word.doc2bow(d['tokens'])

                # infer document-topic distribution
                topics = model.get_document_topics(bow, per_word_topics=False)

                # convert to dictionary: here we convert the topic number to string because mongodb will complain otherwise
                # you will get a message that documents can only have string keys
                dic_topics = {}
                for t in topics:
                    dic_topics[str(t[0])] = float(t[1])

                # create a new document to add to the database, this time in a different collection
                insert_doc = {
                    'journal': d['journal'],
                    'year': d['year'],
                    'title': d['title'],
                    'topics': dic_topics
                }

                # save insert_doc to database within publications collection
                self.db.insert_one_to_collection('publications', insert_doc)

    def get_document_title_per_topic(self):
        """
			Get document title per topic
			Here we obtain the publication title together with the most dominant topic within that publication.
			The dominant topic is the one with the largest topic proportion:
			if a document has topic A = 10%, B = 30%, and C = 60%, then C is the dominant topic.
			We can use the titles of publications sharing a dominant topic to get insight into the label of that topic.
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # empty list where we can append publication titles to
        titles = []

        # loop through all the docs
        for i, d in enumerate(D):

            # print to console
            print_doc_verbose(i, D.count(), d['journal'], d['year'],
                              d['title'])

            # get the dominant topic
            dominant_topic = max(d['topics'].iteritems(), key=itemgetter(1))
            # get the topic ID and percentage
            dominant_topic_id, dominant_topic_percentage = dominant_topic[
                0], dominant_topic[1]

            # append to list
            titles.append([
                d['year'], d['title'], d['journal'], dominant_topic_id,
                dominant_topic_percentage
            ])

        # save to CSV
        save_csv(titles, 'titles-to-topics', folder=self.table_save_folder)

    def plot_topics_over_time(self, plot_save_name='topics-over-time.pdf'):
        """
			Plot cumulative topic distribution over time

			Parameters
			----------
			plot_save_name: string
				name of the plot
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create dictionary where we can obtain the topic distribution per year
        year_to_topics = get_year_to_topics(D)

        # calculate the cumulative topic distribution: basically the average distribution per year
        year_to_cum_topics = get_year_to_cum_topics(year_to_topics)

        # convert dictionary to pandas dataframe
        df = pd.DataFrame.from_dict(year_to_cum_topics)

        # create the plot
        fig, axs = plt.subplots(2, 5, figsize=(15, 10))
        axs = axs.ravel()

        # loop over each row of the dataframe
        for index, row in df.iterrows():

            # get year values
            x = df.columns.values.tolist()
            # get topic proportions
            y = row.tolist()

            # add to plot
            axs[index].plot(x,
                            y,
                            'o--',
                            color='black',
                            linewidth=1,
                            label="Topic prevalence")
            axs[index].set_title(get_topic_label(index), fontsize=14)
            axs[index].set_ylim([0, 0.4])

        # save plot
        plt.savefig(os.path.join(self.plot_save_folder, plot_save_name),
                    bbox_inches='tight')
        plt.close()

    def plot_topics_over_time_stacked(
            self, plot_save_name='topics-over-time-stacked.pdf'):
        """
			Plot topics over time stacked

			Parameters
			----------
			plot_save_name: string
				name of the plot
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create dictionary where we can obtain the topic distribution per year
        year_to_topics = get_year_to_topics(D)

        # calculate the cumulative topic distribution: basically the average distribution per year
        year_to_cum_topics = get_year_to_cum_topics(year_to_topics)

        # convert dictionary to pandas dataframe
        df = pd.DataFrame.from_dict(year_to_cum_topics)

        # transpose dataframe
        df = df.transpose()

        # change column headers into topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]

        # plot the dataframe
        ax = df.plot(figsize=(15, 8),
                     kind='area',
                     colormap='Spectral_r',
                     rot=45,
                     grid=False)
        # set values for x-axis
        plt.xticks(df.index)
        # limit the x-axis
        plt.xlim(min(df.index), max(df.index))
        # limit the y-axis
        plt.ylim(0, 1)
        # get the legend
        handles, labels = ax.get_legend_handles_labels()
        # position it right of the figure
        plt.legend(reversed(handles),
                   reversed(labels),
                   loc='right',
                   bbox_to_anchor=(1.35, 0.50),
                   ncol=1,
                   fancybox=False,
                   shadow=False,
                   fontsize=16)

        # save plot
        plt.savefig(os.path.join(self.plot_save_folder, plot_save_name),
                    bbox_inches='tight')
        plt.close()

    def plot_topic_co_occurrence(self,
                                 plot_save_name='topic-co-occurrence.pdf'):
        """
			Plot topic co-occurrence
			
			Parameters
			----------
			plot_save_name: string
				name of the plot
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create empty dictionary where we can store the dominant topic id and remaining other proportions
        dominant_id_to_topics = {}

        for d in D:

            # sort topics and create list
            topics = [
                value for key, value in sorted(d['topics'].iteritems(),
                                               key=lambda x: int(x[0]))
            ]

            # get max topic id
            max_topic_id = topics.index(max(topics))

            # check if topic ID key already created
            if max_topic_id not in dominant_id_to_topics:
                dominant_id_to_topics[max_topic_id] = []

            dominant_id_to_topics[max_topic_id].append(topics)

        # create empty dictionary where we can have the cumulative topic distribution per dominant topic ID
        dominant_id_to_cum_topics = {}
        for k, v in dominant_id_to_topics.iteritems():

            # calculate mean and add to dictionary
            dominant_id_to_cum_topics[k] = np.mean(np.array(v), axis=0) * 100.

        # convert dictionary to pandas dataframe
        df = pd.DataFrame.from_dict(dominant_id_to_cum_topics)

        # change column headers into topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]
        df.index = [get_topic_label(x) for x in df.index.values]

        # create max column
        df['max'] = 0.

        # keep track of new index
        new_index = []

        # add max column so we can sort on it later
        for index, row in df.iterrows():

            # add value to max column
            df['max'][index] = max(row)
            # make self co-occurrence zero
            df[index][index] = 0.0

            # add new index names to tracker so we can rename it later
            new_index.append('{} ({}%)'.format(index, round(max(row), 2)))

        # update index name
        df.index = new_index

        # sort by max column
        df = df.sort_values(by=['max'], ascending=False)

        # delete max column
        df = df.drop(['max'], axis=1)

        # sort based on column totals
        df = df.reindex(sorted(df.columns), axis=1)

        # plot the heatmap
        ax = sns.heatmap(df,
                         cmap="Blues",
                         annot=True,
                         vmin=0.,
                         vmax=10.,
                         square=True,
                         annot_kws={"size": 11},
                         fmt='.1f',
                         mask=df <= 0.0,
                         linewidths=.5,
                         cbar=False,
                         yticklabels=True)

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=90, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(19, 6)

        # save figure
        fig.savefig(os.path.join(self.plot_save_folder, plot_save_name),
                    bbox_inches='tight')

    def plot_topics_in_journals(self, plot_save_name='topics-in-journals.pdf'):
        """
			Plot the distribution of topics within each of the journals in our dataset.
			This plot provides an overview of the topical content published by a journal given the time frame of our dataset

			Parameters
			----------
			plot_save_name: string
				name of the plot
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # create dictionary where we have key = journal, and value = [topic_distributions]
        journal_to_topics = {}

        # load documents from database
        D = self.db.read_collection(collection='publications')

        # loop over the documents, read in the topic distribution, and add to the correct journal key
        for i, d in enumerate(D):

            # verbose process every 1000th document
            if i % 1000 == 0:
                logging.debug('Processing document {}/{}'.format(i, D.count()))

            # get the name of the journal
            journal = d['journal']

            # check if topics are created
            if d.get('topics') is not None:

                # add journal as key to the dictionary if not already exists
                if journal not in journal_to_topics:

                    # add journal as key with empty list
                    journal_to_topics[journal] = []

                # sort topics and create as list
                topics = [
                    value for key, value in sorted(d['topics'].iteritems(),
                                                   key=lambda x: int(x[0]))
                ]

                # append topic distribution to dictionary
                journal_to_topics[journal].append(topics)

        # get cumulative topic distributions for each journal
        journal_to_cum_topics = get_journal_to_cum_topics(journal_to_topics)

        # convert to Pandas DataFrame
        df = pd.DataFrame.from_dict(journal_to_cum_topics).T

        # change column labels to topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]

        # plot the heatmap
        ax = sns.heatmap(df,
                         cmap="Blues",
                         annot=True,
                         vmin=0.,
                         vmax=.3,
                         square=True,
                         annot_kws={"size": 11},
                         fmt='.2f',
                         mask=df <= 0.0,
                         linewidths=.5,
                         cbar=False,
                         yticklabels=True)

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=90, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(10, 10)

        # save figure
        fig.savefig(os.path.join(self.plot_save_folder, plot_save_name),
                    bbox_inches='tight')

        # close the plot
        plt.close()
Example #17
import pathlib



"""
	Script starts here
"""
if __name__ == "__main__":

	# create logging to console
	set_logger()

	logging.info('Start: {} '.format(__file__))

	# create database connection
	db = MongoDatabase()

	# location of target tweets
	location_tweets = os.path.join('files', 'target_tweets_UNCW')

	# modes of research
	modes_of_research = ['UNCW']

	# process tweets for each mode of research
	for mode in modes_of_research:

		logging.info('Processing mode of research: {}'.format(mode))

		# read tweets files
		F = read_directory(os.path.join(location_tweets, mode))
		print(F)
Example #18
    tweet_id_to_label = {l[0]: l[1] for l in labels}

    # define the label types
    label_types = ['positive', 'neutral', 'negative']

    # define the colors for each sentiment (positive, neutral and negative)
    colors = ['#52bf80', '#088bdc', '#fe6300']

    # plot location
    plot_location = os.path.join('files', 'plots')

    # create location if not exists
    create_directory(plot_location)

    # create database connection
    db = MongoDatabase()

    if create_donot_plot:
        """
			Create a donutplot for interdisciplinary, transdisciplinary and multidisciplinary target tweets where each
			donutplot shows the percentage of positive, negative and neutral tweets

			file will be saved to files/plots/donutplot.pdf
		
		"""

        # create the subplots
        fig, axs = plt.subplots(1, 3, figsize=(21, 6))
        # make axes available like ax[0] instead of ax[0,0]
        axs = axs.ravel()
Example #19
import json
from helper_functions import *
from database import MongoDatabase
"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # name of collection to store all the training tweets to
    db_collection = 'training_tweets'

    # setup spacy object, so we can do some NLP things
    nlp = setup_spacy()

    # sources to process and the collections they are stored in
    process_sources = {
        'sanders': 'sanders_tweets_raw',
        'semeval': 'semeval_tweets_raw',
        'clarin13': 'clarin13_tweets_raw',
        'hcr': 'hcr_tweets_raw',
        'omd': 'omd_tweets_raw',
        'stanford': 'stanford_tweets_raw',
Example #20
from database import MongoDatabase
from sklearn.externals import joblib
"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    # verbose
    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # load classifier
    clf = joblib.load(os.path.join('files', 'ml_models', 'LinearSVC.pkl'))

    # read labels for target tweets that have been manually labeled and convert to dictionary with key = tweet ID and value = label
    true_labels = {
        d['tweet_id']: d['label']
        for d in db.read_collection(collection='manual_tweets_raw')
    }

    # load tweets for which we want to infer the sentiment label
    D = db.read_collection(collection='target_tweets')

    # create empty numpy array so we can retrieve labels later on somewhat faster
    labels = np.zeros((D.count(), 3), dtype=np.int)
Example #21
class Evaluation():
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

    def calculate_coherence(self,
                            file_folder=os.path.join('files', 'lda'),
                            models_folder=os.path.join('files', 'models')):
        """
			Calculate the CV coherence score for each of the created LDA models

			Parameters
			----------
			file_folder: os.path
				location of the dictionary and corpus for gensim
			models_folder: os.path
				location where the lda model is saved
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read dictionary and corpus
        dictionary, corpus = get_dic_corpus(file_folder)

        # load bag of words features of each document from the database
        texts = [
            x['tokens'] for x in self.db.read_collection('publications_raw')
        ]

        # get path location for models
        M = [
            x for x in read_directory(models_folder) if x.endswith('lda.model')
        ]

        # read processed models from database
        processed_models = [
            '{}-{}-{}-{}-{}'.format(x['k'], x['dir_prior'], x['random_state'],
                                    x['num_pass'], x['iteration'])
            for x in self.db.read_collection('coherence')
        ]

        # calculate coherence score for each model
        for i, m in enumerate(M):

            logging.info('Calculating coherence score: {}/{}'.format(
                i + 1, len(M)))

            print m
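
            # the indices below assume the model path layout used elsewhere in this project,
            # e.g. files/models/<k>/<dir_prior>/<random_state>/<num_pass>/<iteration>/lda.model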

            # number of topics
            k = m.split(os.sep)[2]
            # different dirichlet priors
            dir_prior = m.split(os.sep)[3]
            # random initializations
            random_state = m.split(os.sep)[4]
            # passes over the corpus
            num_pass = m.split(os.sep)[5]
            # max iteration for convergence
            iteration = m.split(os.sep)[6]

            logging.info(
                'k: {}, dir_prior: {}, random_state: {}, num_pass: {}, iteration: {}'
                .format(k, dir_prior, random_state, num_pass, iteration))

            # check if coherence score already obtained
            if '{}-{}-{}-{}-{}'.format(k, dir_prior, random_state, num_pass,
                                       iteration) not in processed_models:

                # load LDA model
                model = models.LdaModel.load(m)

                # get coherence c_v score
                coherence_c_v = CoherenceModel(model=model,
                                               texts=texts,
                                               dictionary=dictionary,
                                               coherence='c_v')

                # get coherence score
                score = coherence_c_v.get_coherence()

                # logging output
                logging.info('coherence score: {}'.format(score))

                # save score to database
                doc = {
                    'k': k,
                    'dir_prior': dir_prior,
                    'random_state': random_state,
                    'num_pass': num_pass,
                    'iteration': iteration,
                    'coherence_score': score
                }
                self.db.insert_one_to_collection('coherence', doc)

            else:
                logging.info(
                    'coherence score already calculated, skipping ...')
                continue

    def plot_coherence(self,
                       min_k=2,
                       max_k=20,
                       save_location=os.path.join('files', 'plots'),
                       plot_save_name='coherence_scores_heatmap.pdf'):
        """
			Read coherence scores from database and create heatmap to plot scores

			Parameters
			-----------
			min_k: int 
				lowest number of topics created when creating LDA models. Here 2
			max_k: int
				highest number of topics created when creating LDA models. Here 20
			save_location: os.path
				location where to save the plot
			plot_save_name: string
				name for the plot
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # make sure plot save location exists
        create_directory(save_location)

        # read documents from database that contain coherence scores
        D = list(self.db.read_collection(collection='coherence'))

        # convert data from document into a list
        data = [[
            int(x['k']), x['dir_prior'], x['random_state'], x['num_pass'],
            x['iteration'], x['coherence_score']
        ] for x in D]

        # create empty dataframe where we can store our scores
        df = pd.DataFrame()

        # loop through values of the k parameter and find relevant scores for each grid search combination
        for k in range(min_k, max_k + 1):

            # create dataframe to temporarily store values
            df_temp = pd.DataFrame(index=[k])

            # loop through the data to obtain only the scores for a specific k value
            for row in sorted(data):
                if row[0] == k:
                    df_temp['{}-{}-{}-{}'.format(
                        row[1], row[2], row[3], row[4])] = pd.Series(row[5],
                                                                     index=[k])

            # append temporary dataframe of only 1 k value to the full dataframe
            df = df.append(df_temp)

        # transpose the dataframe
        df = df.transpose()

        # plot the heatmap
        ax = sns.heatmap(df,
                         cmap="Blues",
                         annot=True,
                         vmin=0.500,
                         vmax=0.530,
                         square=True,
                         annot_kws={"size": 11},
                         fmt='.3f',
                         linewidths=.5,
                         cbar_kws={'label': 'coherence score'})

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=0, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(19, 6)

        # save figure
        fig.savefig(os.path.join(save_location, plot_save_name),
                    bbox_inches='tight')

    def output_lda_topics(self,
                          K=9,
                          dir_prior='auto',
                          random_state=42,
                          num_pass=15,
                          iteration=200,
                          top_n_words=10,
                          models_folder=os.path.join('files', 'models'),
                          save_folder=os.path.join('files', 'tables')):
        """
			Create table with LDA topic words and probabilities
			Creates a table of topic words and probabilities + topics in a list format

			Values for K, dir_prior, random_state, num_pass and iteration will become visible when plotting the coherence score. Use the model that
			achieved the highest coherence score and plug in the correct values. The values will create the correct file location of the LDA model
			for example : files/models/2/auto/42/5/200/lda.model

			Parameters
			-----------
			k: int
				number of topics that resulted in the best decomposition of the underlying corpora
			dir_prior: string
				dirichlet priors 'auto', 'symmetric', 'asymmetric'
			random_state: int
				seed value for random initialization
			num_pass: int
				number of passes over the full corpus
			iteration: int
				max iterations for convergence
			top_n_words: int
				only print out the top N high probability words
			models_folder: os.path
				location of created LDA models
			save_folder: os.path
				location to store the tables

		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load LDA model according to parameters
        model = load_lda_model(
            os.path.join(models_folder, str(K), dir_prior, str(random_state),
                         str(num_pass), str(iteration)))

        # define empty lists so we can fill them with words
        topic_table, topic_list = [], []

        # loop through all the topics found within K
        for k in range(K):

            # create topic header, e.g. (1) TOPIC X
            topic_table.append([
                '{}'.format(
                    get_topic_label(k, labels_available=False).upper())
            ])
            # add column for word and probability
            topic_table.append(["word", "prob."])

            list_string = ""
            topic_string = ""
            topic_string_list = []

            # get topic distribution for topic k and return only top-N words
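            # (print_topic returns a string like '0.032*"word" + 0.025*"another" + ...',
            # so splitting on '+' yields one 'probability*"word"' chunk per term)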
            scores = model.print_topic(k, top_n_words).split("+")

            # loop through each word and probability
            for score in scores:

                # extract score and trim spaces
                score = score.strip()

                # split on *
                split_scores = score.split('*')

                # get percentage
                percentage = split_scores[0]
                # get word
                word = split_scores[1].strip('"')

                # add word and percentage to table
                topic_table.append(
                    [word.upper(), "" + percentage.replace("0.", ".")])

                # add word to list table
                list_string += word + ", "

            # add empty line for the table
            topic_table.append([""])
            # add topic words to list
            topic_list.append([str(k + 1), list_string.rstrip(", ")])

        # save to CSV
        save_csv(topic_list, 'topic-list', folder=save_folder)
        save_csv(topic_table, 'topic-table', folder=save_folder)
Example #22
	def exitProgram():
		print 'Ending Process!'
		MongoDatabase.terminate()
		exit()
Example #23
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()
Example #24
# switches, set to True what needs to be executed
filter_tweets = True
clean_tweets = True
"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # execute if set to True
    if filter_tweets:
        """	
			Filter raw target tweet
				- remove non-English tweets
				- remove retweet
				- remove tweets that do  not originate from an academic or scientist (by using bio text)

			raw tweets are stored in the collection 'raw_tweets'
			filtered tweets will be stored in the collectin 'filtered_tweets'
		"""

        # read tweets documents from database
        D = db.read_collection(collection='raw_tweets')
Example #25
class Frec(QMainWindow):
    def __init__(self):
        super().__init__()

        self.ui = mainwindow.Ui_Frec()
        self.ui.setupUi(self)
        self.db = MongoDatabase()

        self.init_frec()

    def init_frec(self):
        self.ui.btn_register.clicked.connect(self.save_new_member)
        self.ui.btn_delete.clicked.connect(self.delete_member)
        self.ui.btn_createDesktopEntry.clicked.connect(self.create_desktop_entry)
        self.ui.btn_clear.clicked.connect(self.clear_form)
        self.ui.btn_export.clicked.connect(self.export_cvs)
        self.ui.btn_import.clicked.connect(self.import_cvs)
        self.ui.btn_exportAsCVS.clicked.connect(self.export_cvs)

        self.ui.btn_connectDb.clicked.connect(self.connect_db)

        self.connect_db()
        self.show_member_at_tableWidget()

    def show_message(self, msg, timeout=3000):
        self.ui.statusBar.showMessage(msg, timeout)

    def connect_db(self):
        dbName = self.ui.lne_dbName.text()
        if self.db.connect(dbName):
            self.show_message("Database connection with '" + dbName + "' is successful.")
            self.ui.lbl_dbConnection.setText("Database : Connected '" + dbName + "'")
            self.ui.lbl_dbConnection.setStyleSheet('color: green')
        else:
            self.ui.lbl_dbConnection.setText("Database : Disconnected")
            self.ui.lbl_dbConnection.setStyleSheet('color: red')
            self.show_message("Database connection couldn't be established!")
            return False

        return True

    # Save new member to database
    def save_new_member(self):
        new = {
            "firstname": self.ui.lne_firstName.text(),
            "surname": self.ui.lne_lastName.text(),
            "department": self.ui.comboBox_department.currentText(),
            "email": self.ui.lne_email.text(),
            "mobilecyp": self.ui.lne_mobileCyp.text(),
            "mobileother": self.ui.lne_mobileOther.text(),
        }

        if self.db.add_new_member(new['firstname'], new['surname'], new['email'], new['department'], new['mobilecyp'],
                               new['mobileother']) is False:
            return False

        self.add_member_to_tableWidget(new['firstname'], new['surname'], new['email'], new['department'],
                                       new['mobilecyp'], new['mobileother'])

    # show member on tableview
    def add_member_to_tableWidget(self, firstname, surname, email, department, mobilecyp, mobileother):
        rowPoint = self.ui.tableWidget.rowCount()
        self.table.insertRow(rowPoint)
        self.table.setItem(rowPoint, 0, QTableWidgetItem(firstname))
        self.table.setItem(rowPoint, 1, QTableWidgetItem(surname))
        self.table.setItem(rowPoint, 2, QTableWidgetItem(email))
        self.table.setItem(rowPoint, 3, QTableWidgetItem(department))
        self.table.setItem(rowPoint, 4, QTableWidgetItem(mobilecyp))
        self.table.setItem(rowPoint, 5, QTableWidgetItem(mobileother))

    def show_member_at_tableWidget(self):
        self.table = self.ui.tableWidget
        self.table.setRowCount(0)
        self.table.setColumnCount(6)
        self.table.setHorizontalHeaderLabels(
            ["Name", "Surname", "E-mail", "Department", "Mobile No Cyp", "Mobile No Other"])
        query = {}  # means no condition. so it will get everyone.
        result = self.db.query_result_multi("Member", query)
        if not result:
            return False
        for member in result:
            self.add_member_to_tableWidget(member["name"]["first"], member["name"]["last"], member["email"],
                                           member["department"], member["mobileNo"]["cyp"], member["mobileNo"]["other"])

    # Delete chosen member on tableView
    def delete_member(self):
        indexes = self.ui.tableWidget.selectedIndexes()
        for i in indexes:
            # The following line gets the selected row's email address
            email_to_delete = str(self.ui.tableWidget.item(i.row(), 2).text())

            # Delete member who has the email
            self.db.delete_member_by_email(email_to_delete)

        self.show_member_at_tableWidget()

    # Clear inside the line edits in registration form
    def clear_form(self):
        self.ui.lne_email.setText("")
        self.ui.lne_firstName.setText("")
        self.ui.lne_lastName.setText("")
        self.ui.lne_mobileCyp.setText("")
        self.ui.lne_mobileOther.setText("")
        self.ui.comboBox_department.setCurrentIndex(0)

    # Export members in the database as csv
    def export_cvs(self):

        with open("freg_export.cvs", "w+") as file:
            csv_file = csv.writer(file)
            data = [self.ui.lne_firstName.text(), self.ui.lne_lastName.text()]
            csv_file.writerow(data)

    # Import members from a csv file
    def import_cvs(self):
        pass  # TODO

    def create_desktop_entry(self):
        fd = open(os.path.expanduser("~/.local/share/applications/freg.desktop"), "w")
        fd.write("[Desktop Entry]\n")
        fd.write("Version=1.0 \n")
        fd.write("Type=Application\n")
        fd.write("Name=Freg\n")
        fd.write("Exec=python3 ~/.faunus/frec/frec.py\n")
        fd.write("Icon=~/.faunus/frec/data/icon/appicon.png\n")
        fd.write("Comment=Small member registration system\n")
        fd.write("Terminal=false\n")
        fd.close()

    def arrange_for_cvs(self):
        dbb = self.db.query_result_multi("Member", {})
        arranged_str = ""

        for member in dbb:
            one = member["name"]["first"] + "," + member["name"]["last"] + "," + member["email"]
            arranged_str += one + "\n"

        return arranged_str
Example #26
from database import MongoDatabase
from config import MoR


"""
	Script starts here
"""
if __name__ == "__main__":

    # create logging to console
    set_logger()

    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # location of target tweets
    location_tweets = os.path.join('files', 'target_tweets')

    # process tweets for each mode of research
    for mode in MoR:

        logging.info('Processing mode of research: {}'.format(mode))

        # read tweets files
        F = read_directory(os.path.join(location_tweets, mode))

        # tracker to keep track of processed tweet ids
        tweet_tracker = set(['{}{}'.format(x['tweet_type'], x['id'])
                            for x in db.read_collection(collection='raw_tweets')])
Example #27
import os
import database
from database import MongoDatabase
from config import PitConfig
from routes import PitRoutes

reload(sys)
sys.setdefaultencoding('utf8')

# ---------------------------------------------------------------------------- #
# Create Flask web application object.
pit_app = flask.Flask(__name__, static_url_path="")
pit_app.secret_key = "roflmao"

# Set up and verify database connection.
database = MongoDatabase(PitConfig)
database.connect()

# Set up API endpoints.
routes = PitRoutes(pit_app, database, PitConfig, flask.make_response,
                   flask.render_template)

if __name__ == "__main__":
    # Start application.
    debug = PitConfig['web']['debug'] == 'True'
    port = int(PitConfig['web']['port'])
    if debug:
        pit_app.run(debug=debug, host="0.0.0.0", port=port)
    else:
        context = ('fullchain.pem', 'privkey.pem')
        pit_app.run(debug=debug,
Example #28
class Core:
    def __init__(self):

        self.components = []
        self.database = MongoDatabase("localhost")

        self.load_components()

    def load_components(self):

        log.info("Loading components...")

        for record in self.database.get_all_records("components"):
            try:
                self.create_component(record, save=False)
            except ValueError:
                pass

    def open_command_tunnel(self, content):
        matches = commands.recognise_command(content)
        if len(matches) == 0:
            return None

        tunnel = commands.MessageTunnel()

        context = commands.CommandContext(matches[0], tunnel)
        context.initial_message = content
        context.core = self
        context.run_command()
        return tunnel

    def create_component(self, data, save=True):
        component = components.create_component(data, core=self)
        if save:
            component.save(self.database)
        self.components.append(component)

    def find_component_by_filter(self, record_filter):
        return self.find_components_by_filter(record_filter)[0]

    def find_components_by_filter(self, record_filter):
        out = []
        for record in self.database.find_all("components", record_filter):
            for component in self.components:
                if component.uuid == record["uuid"]:
                    out.append(component)
        return out

    def save_all(self):

        log.info("Saving everything...")

        for component in self.components:
            component.save(self.database)

    def cleanup_all(self):

        log.info("Cleaning everything...")

        for component in self.components:
            component.cleanup()
Example #29
###### Install Pillow, not PIL ######
from constants import Constants
import math
import glob, os
import Tkinter
from Tkinter import *
import ttk
from PIL import Image, ImageTk
import zipfile
import shutil
from database import MongoDatabase
from inner_folder.ui_class import UI

if __name__=='__main__':
	###### Begin Script ######
	###### Connection to Mongo DB ######
	MongoDatabase.initlialize()
	# process = MongoDatabase.get_mongo_process()

	ui = UI(MongoDatabase.db)
	ui.checkPath()
	ui.root.mainloop()
Example #30


"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # name of collection to store all the training tweets to
    db_collection = 'training_tweets'

    # location to save machine learning classification models to
    model_save_location = os.path.join('files', 'ml_models2')

    # get all the training tweet documents
    D = db.read_collection(collection=db_collection)

    # get values from list and assign to X and Y
    X, Y = zip(*[(x['text'], str(x['label'])) for x in D])

    # define pipeline options
    pipeline_setup = get_pipeline_setup()
Example #31
from database import MongoDatabase
import joblib
"""
	Script starts here
"""

if __name__ == "__main__":

    # create logging to console
    set_logger()

    # verbose
    logging.info('Start: {} '.format(__file__))

    # create database connection
    db = MongoDatabase()

    # load classifier
    clf = joblib.load(os.path.join('files', 'ml_models3', 'LinearSVC.pkl'))

    # read labels for target tweets that have been manually labeled and convert to dictionary with key = tweet ID and value = label
    true_labels = {
        d['tweet_id']: d['label']
        for d in db.read_collection(collection='sanders_tweets_raw')
    }

    # load tweets for which we want to infer the sentiment label
    D = db.read_collection(collection='target_tweets_UNCW')

    # create empty numpy array so we can retrieve labels later on somewhat faster
    labels = np.zeros((D.count(), 3), dtype=np.int64)