class DataExporterMaster:
    """Drive article exports through a task queue and a result queue.

    Each export starts one CSVWriterThread built on the result queue and
    ``self._threadNumber`` DataProcessorThread workers built on both queues,
    feeds articles obtained from the DBController into the task queue, and
    finally puts END_OF_QUEUE markers on the queues and joins the threads.
    """

    def __init__(self):
        self._resultQueue = Queue()
        self._taskQueue = Queue()
        self._db = DBController()
        # Number of worker threads started per export.
        self._threadNumber = 1
        # Every worker started by this instance; entries are never removed.
        self._threadList = []

    def _startWorkers(self, executeName, *workerArgs):
        """Start ``self._threadNumber`` workers and record them.

        Args:
            executeName: name of the DataProcessorThread method that is
                assigned to each worker's ``_executeFunction``.
            *workerArgs: extra positional arguments passed to the
                DataProcessorThread constructor after the two queues.
        """
        for _ in range(self._threadNumber):
            worker = DataProcessorThread(
                self._taskQueue, self._resultQueue, *workerArgs)
            worker._executeFunction = getattr(worker, executeName)
            worker.start()
            self._threadList.append(worker)

    def _fetchUnprocessedBatch(self, batchSize):
        """Return the next batch of unprocessed articles as a list.

        An empty list is returned when the DB call yields nothing or
        returns None.
        """
        batch = self._db.getUnprocessedArticleInBatch(batchSize)
        # The None check has to happen before list(): list(None) raises
        # TypeError, so a check made afterwards could never fire.
        if batch is None:
            return []
        return list(batch)

    def exportAllCitationBlock(self):
        """Export citation sentences to 'export/allCitationSentence.csv'.

        Articles are pulled from the DB in batches until a fetch comes back
        empty. After each round the task queue is joined before the next
        fetch. Original author's note: a single thread is enough here.
        """
        attributeList = [
            'cotic', 'coname', 'filePath', 'accessNo', 'date', 'source',
            'byline', 'byline_cleaned', 'headline', 'sentence',
            'cite_content', 'cite_word', 'actor', 'organization', 'engager',
            'FCEO', 'broker',
            'total_word_count', 'cau_int', 'cau_ext', 'cont_l', 'cont_h',
            'pos', 'neg', 'uncert',
            'cau_int_words', 'cau_ext_words', 'cont_l_words', 'cont_h_words',
            'pos_words', 'neg_words', 'uncert_words',
        ]

        # To resume a previous run instead of starting over, comment out
        # the next line and change the writer mode below to append ('a').
        self._db.setAllArticleUnprocessed()

        writer = CSVWriterThread(
            self._resultQueue, 'export/allCitationSentence.csv',
            attributeList, mode='w')
        writer.start()

        # Original author's warning: must stay at 100, other values hit a
        # bug (the cause is not visible in this class).
        batchSize = 100

        self._startWorkers('processCitationBlock')

        try:
            isDone = False
            while not isDone:
                for _ in range(self._threadNumber):
                    articleBatch = self._fetchUnprocessedBatch(batchSize)
                    if not articleBatch:
                        isDone = True
                        break
                    self._taskQueue.put(articleBatch)
                # Wait for this round to be consumed before fetching again.
                self._taskQueue.join()
                print('################')
        finally:
            # Run the shutdown even if feeding failed; otherwise the started
            # threads would never receive their END_OF_QUEUE markers.
            for _ in range(self._threadNumber):
                self._taskQueue.put(END_OF_QUEUE)
            self._taskQueue.join()
            for worker in self._threadList:
                worker.join()

            self._resultQueue.put(END_OF_QUEUE)
            self._resultQueue.join()
            writer.join()

    def exportKeywordSearch(self, searchString):
        """Export articles matching searchString to 'export/keywordSearch.csv'.

        Args:
            searchString: passed to the DB query and to every worker.
        """
        # NOTE(review): this overrides the instance-wide worker count and is
        # never restored, so a later exportAllCitationBlock call on the same
        # instance also runs with 4 workers - confirm this is intended.
        self._threadNumber = 4
        attributeList = [
            'cotic', 'coname', 'filePath', 'accessNo', 'date', 'source',
            'byline', 'headline', 'sentence',
        ]

        writer = CSVWriterThread(
            self._resultQueue, 'export/keywordSearch.csv', attributeList)
        writer.start()

        self._startWorkers('processKeywordSearch', searchString)

        try:
            # The DB call returns a cursor (original author's note), so the
            # articles are queued one by one while iterating it.
            articleListCursor = self._db.getAllArticleBySearchString(
                searchString)
            for article in articleListCursor:
                self._taskQueue.put(article)
        finally:
            # Run the shutdown even if the query or iteration failed.
            for _ in range(self._threadNumber):
                self._taskQueue.put(END_OF_QUEUE)
            for worker in self._threadList:
                worker.join()

            self._resultQueue.put(END_OF_QUEUE)
            writer.join()