def _loadSpeakerRegistries(speakerTypeFilePath):
    """Read the speaker-type CSV into one ``{speakerName: speakerId}`` map per type.

    The first row is treated as a header and skipped. For every other row the
    speaker name, type and id are read from columns 14, 15 and 16 (0-based);
    the type is upper-cased before being compared with the ``TYPE_*`` constants.
    Rows whose type matches none of the constants are printed and ignored.

    Returns a list of ``(speakerType, registry)`` pairs ordered analyst, CEO,
    journalist, dot; that order is also the lookup priority used later.
    """
    registries = [
        (TYPE_ANALYST, {}),
        (TYPE_CEO, {}),
        (TYPE_JOURNALIST, {}),
        (TYPE_DOT, {}),
    ]
    # NOTE(review): 'rU' is the Python 2 universal-newline mode (removed in
    # Python 3.11). Kept as-is because the rest of the module is not visible.
    with open(speakerTypeFilePath, 'rU') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row
        for row in rows:
            speakerName = row[14].strip()
            speakerType = row[15].strip().upper()
            speakerId = row[16].strip()
            for knownType, registry in registries:
                if speakerType == knownType:
                    registry[speakerName] = speakerId
                    break
            else:
                # Unknown speaker type: report it and move on.
                print(speakerName, speakerType)
    return registries


def _splitSpeakerAndOrder(namePart):
    """Split a trailing ``<speakerName><order>`` file-name part.

    The order is the last one or two characters when they are digits (two are
    taken whenever the second-to-last character is also a digit). Returns
    ``(speakerName, speechOrder)`` or ``None`` when the part does not end in a
    digit. Safe for empty and one-character parts.
    """
    if not namePart or not namePart[-1].isdigit():
        return None
    digitCount = 2 if len(namePart) >= 2 and namePart[-2].isdigit() else 1
    return namePart[:-digitCount].strip(), int(namePart[-digitCount:])


def _lookupSpeaker(registries, speakerName):
    """Return ``(speakerType, speakerId)`` for a known speaker, else ``None``.

    Registries are searched in list order, so the first type that knows the
    name wins.
    """
    for speakerType, registry in registries:
        if speakerName in registry:
            return speakerType, registry[speakerName]
    return None


def loadAllDialoguesFromFile(speakerTypeFilePath, folderPath):
    """Rebuild the dialogue database from a speaker CSV and a tree of speech files.

    Steps:
      1. Create a ``DBController`` and call ``dropDB()`` on it.
      2. Load the speaker-type CSV (see ``_loadSpeakerRegistries``).
      3. Walk ``folderPath``; only directories whose own name starts with
         ``chunk`` are scanned. Each file ending in ``txt`` is expected to be
         named
         ``company_time_sessionType_sessionOrder_asker_answerer..._<speaker><NN>.txt``.
         Files whose last part ends with ``default`` or ``copy``, or does not
         end in a digit, are skipped.
      4. For every accepted file the conference, session and speech records
         are fetched and inserted when missing. Speakers not found in the CSV
         are stored with ``TYPE_DOT`` and an empty id, and are printed.
      5. ``db.ensureIndex()`` is called once, after the first file has been
         processed successfully.

    Import is best-effort per file: any exception raised while handling one
    file is printed together with the file name and the walk continues.

    Args:
        speakerTypeFilePath: path of the speaker-type CSV file.
        folderPath: root directory that contains the ``chunk*`` directories.
    """
    db = DBController()
    db.dropDB()
    registries = _loadSpeakerRegistries(speakerTypeFilePath)
    ensuredIndex = False

    for dirPath, dirNames, fileNames in os.walk(folderPath):
        print(dirPath)
        if not os.path.split(dirPath)[-1].startswith('chunk'):
            continue

        for fileName in fileNames:
            try:
                if not fileName.endswith('txt'):
                    continue

                fileNameParts = [part.strip() for part in fileName.split('.txt')[0].split('_')]

                # Decide whether the file is wanted at all before touching the
                # positional fields, so deliberately skipped files never raise.
                lastPart = fileNameParts[-1]
                if lastPart.endswith('default') or lastPart.endswith('copy'):
                    continue
                speakerAndOrder = _splitSpeakerAndOrder(lastPart)
                if speakerAndOrder is None:
                    continue
                speakerName, speechOrder = speakerAndOrder

                company, time = fileNameParts[0], fileNameParts[1]
                sessionType, sessionOrder = fileNameParts[2], int(fileNameParts[3])
                asker, answerer = fileNameParts[4], fileNameParts[5]

                conference = db.getConferenceByCompanyTime(company, time)
                if conference is None:
                    conference = db.insertConference({'company' : company, 'time' : time})

                session = db.getSessionByConferenceAndOrder(conference['_id'], sessionOrder)
                if session is None:
                    # The session is looked up by sessionOrder, so it must be
                    # stored under sessionOrder too (it used to be stored under
                    # the speech order, which made the lookup miss).
                    session = db.insertSession({
                        'conference' : conference['_id'],
                        'order' : sessionOrder,
                        'type' : sessionType,
                        'asker' : asker,
                        'answerer' : answerer,
                    })

                speech = db.getSpeechByConferenceIdAndSessionIdAndOrder(conference['_id'], session['_id'], speechOrder)
                if speech is None:
                    knownSpeaker = _lookupSpeaker(registries, speakerName)
                    if knownSpeaker is not None:
                        speakerType, speakerId = knownSpeaker
                    else:
                        speakerType, speakerId = TYPE_DOT, ''
                        print(fileName, speakerName)

                    filePath = os.path.join(dirPath, fileName)
                    # NOTE(review): 'rU' and str.decode are Python 2 idioms;
                    # left unchanged to preserve the stored text exactly.
                    with open(filePath, 'rU') as f:
                        text = ' '.join(f.readlines()).strip()
                    # Drop every non-ASCII byte from the speech text.
                    text = text.decode('ascii', 'ignore').encode('ascii', 'ignore')

                    db.insertSpeech({
                        'conference' : conference['_id'],
                        'session' : session['_id'],
                        'order' : speechOrder,
                        'text' : text,
                        'speakerName' : speakerName,
                        'speakerType' : speakerType,
                        'speakerId' : speakerId,
                    })

                if not ensuredIndex:
                    db.ensureIndex()
                    ensuredIndex = True
            except Exception as e:
                # Best-effort import: report the offending file and keep going.
                print(fileName)
                print(e)