Esempio n. 1
0
    matchObj = re.match(r'(.*)\.pdf', filename)
    if matchObj:
        try:
            docname = matchObj.group(1)
            if docname+"_out.txt" not in os.listdir(data):
                print "Retrieving text of",docname
                pdf2txt.pdf_to_file(data+os.sep+filename, data+os.sep+docname+"_out.txt")
        except:
            print "Problem getting the name of the file \"",filename,"\"."


# Set up the inputs used to build one TF-IDF matrix per time period.
def _firstOfMonth(formattedDate):
    """Return a date for day 1 of the month described by *formattedDate*.

    The string form of *formattedDate* is split on whitespace; the first
    field is read as the year and the second as the month.
    """
    fields = str(formattedDate).split()
    return date(int(fields[0]), int(fields[1]), 1)


periodFrequenciesList = []
matrixList = []

# Command-line arguments: author, start date, end date, period length.
authorName = argv[1]
startDate = Timeline.formatDate(argv[2])
endDate = Timeline.formatDate(argv[3])

# Convert the formatted dates to date objects pinned to the first of the month.
startDateTime = _firstOfMonth(startDate)
endDateTime = _firstOfMonth(endDate)

# Length of one period, in months (it is passed to relativedelta(months=...)).
periodLength = int(argv[4])

# Number of periods considered.
periodNumber2 = monthdelta(startDateTime, endDateTime) // periodLength

# Bounds of the first period.
date1 = startDateTime
date2 = date1 + relativedelta(months=periodLength)

# Create one TF-IDF matrix per period and collect them in matrixList.
# The loop runs periodNumber2 + 1 times.
for i in range(periodNumber2+1):  # i is the index of the current period
    print i  # progress output: index of the period being processed
    m=dataTimeline.createTFIDFMatrix(authorName, date1, date2, 
            variables.data_dir + os.sep + bibName) # TF-IDF matrix with all words/concepts for [date1, date2].
    # Disabled code, kept for reference:
#    tops = m.weights(number=5) # dictionary {concept: weight} for the top five concepts, weight of best concept = 100, least = 1
    matrixList.append(m)
    # The next period starts where the current one ended.
    # NOTE(review): the matching update of date2 is not visible in this
    # excerpt -- confirm that it follows.
    date1=date2