# Convert one PDF to text unless its "<docname>_out.txt" output already exists.
# NOTE(review): `filename` and `data` are defined before this section --
# presumably by an enclosing loop over the files of the `data` directory;
# confirm the indentation of this step against the surrounding code.
matchObj = re.match(r'(.*)\.pdf', filename)
if matchObj:
    docname = matchObj.group(1)
    outputPath = data + os.sep + docname + "_out.txt"
    # Test for the single expected output file instead of re-listing the
    # whole directory for every candidate PDF.
    if not os.path.exists(outputPath):
        try:
            print("Retrieving text of %s" % docname)
            pdf2txt.pdf_to_file(data + os.sep + filename, outputPath)
        except Exception:
            # Best effort: report the failing file and carry on. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            print('Problem getting the name of the file " %s ".' % filename)

# Constructing TFIDF matrices for every concerned time period
periodFrequenciesList = []
matrixList = []
authorName = argv[1]
startDate = Timeline.formatDate(argv[2])
endDate = Timeline.formatDate(argv[3])

# Conversion to date objects pinned to the first day of the month: the first
# two whitespace-separated fields of each formatted date are read as
# (year, month). Each formatted date is split only once.
startParts = str(startDate).split()
endParts = str(endDate).split()
startDateTime = date(int(startParts[0]), int(startParts[1]), 1)
endDateTime = date(int(endParts[0]), int(endParts[1]), 1)

periodLength = int(argv[4])  # length of one period, in months
periodNumber2 = monthdelta(startDateTime, endDateTime) // periodLength  # number of periods considered

# [date1, date2] bounds the period handled by the current loop iteration.
date1 = startDateTime
date2 = date1 + relativedelta(months=+periodLength)
for i in range(periodNumber2 + 1):  # create TFIDF matrices for each period
    print(i)
    # TFIDF matrix with all words/concepts.
    m = dataTimeline.createTFIDFMatrix(authorName, date1, date2, variables.data_dir + os.sep + bibName)
    # tops = m.weights(number=5)  # dictionary {concept: weight} for the top five concepts, weight of best concept = 100, least = 1
    matrixList.append(m)
    date1 = date2
    # NOTE(review): date2 is not advanced in the visible part of this loop
    # body; presumably that happens in the code that follows -- confirm.