コード例 #1
0
# Treat each year in [first_year, last_year) in turn as the hypothetical
# "current" year of the experiment.
for curr_year in range(first_year, last_year):
    # Fetch the training posts via getPostsFromAYear(curr_year) --
    # presumably the posts of that single year only.
    # NOTE(review): an earlier variant used
    # curBlog.getPostsBetweeDates(first_year-01-01 .. curr_year-12-31),
    # i.e. everything up to and including curr_year. Confirm the narrower
    # window is intended.
    pre_train_set = curBlog.getPostsFromAYear(curr_year)
    pre_train_set_parse = flog.microCorpus(pre_train_set)

    # One float slot per training post. np.empty leaves the contents
    # uninitialised, exactly as the bare np.ndarray constructor did.
    # NOTE(review): nothing visible here fills or reads `wa` -- verify it
    # is still needed.
    wa = np.empty(len(pre_train_set), dtype=float)
    if v:
        # Single-argument parenthesised print: same output on Python 2,
        # and valid syntax on Python 3.
        print("current_year:" + str(curr_year) + "-train_total:" +
              str(len(pre_train_set)))

    # Derive the feature space from the parsed training posts and
    # vectorise them in it, using the French stop-word list.
    # (featC/featTI and cntMat/tiMat are presumably the count and TF-IDF
    # vectorisers and matrices -- confirm against ct.getBagOfWords.)
    featC, featTI, tiMat, cntMat = ct.getBagOfWords(
        ct.loadStopwords("stopwords.french.list"), pre_train_set_parse)
    if v:
        print("training matrix shape:" + str(tiMat.shape))

    # Convert the per-post categories to multilabel format; handLbl exposes
    # the known categories through its classes_ attribute.
    handLbl, labels_trn = ct.labelsToMultilabelF(
        pre_train_set_parse.getInstancesCategories())
    if v:
        print("objects with labels: " + str(labels_trn.shape) +
              " - number of categories: " + str(len(list(handLbl.classes_))))
#----removed from full experiment
#weight=1
#    weight=1
#    posW=0
##for each year since curr_year until the first_year
#    for year in list(reversed(range(first_year,curr_year+1))):
コード例 #2
0
if v:
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3.
    print("///////////////////////////////////////////////////////////")

# Treat each year in [first_year, last_year) in turn as the hypothetical
# "current" year of the experiment.
for curr_year in range(first_year, last_year):
    # Collect every post dated from first_year-01-01 through
    # curr_year-12-31, i.e. everything up to and including curr_year.
    pre_train_set = curBlog.getPostsBetweeDates(
        str(first_year) + "-01-01", str(curr_year) + "-12-31")
    pre_train_set_parse = flog.microCorpus(pre_train_set)

    # One float slot per training post. np.empty leaves the contents
    # uninitialised, exactly as the bare np.ndarray constructor did.
    # (Presumably the per-sample weights, filled further down -- not
    # visible in this excerpt.)
    wa = np.empty(len(pre_train_set), dtype=float)
    if v:
        print("current_year:" + str(curr_year) + "-train_total:" +
              str(len(pre_train_set)))

    # Derive the feature space from the parsed training posts and
    # vectorise them in it, using the French stop-word list.
    # (featC/featTI and cntMat/tiMat are presumably the count and TF-IDF
    # vectorisers and matrices -- confirm against ct.getBagOfWords.)
    featC, featTI, tiMat, cntMat = ct.getBagOfWords(
        ct.loadStopwords("stopwords.french.list"), pre_train_set_parse)
    if v:
        print("training matrix shape:" + str(tiMat.shape))

    # Convert the per-post categories to multilabel format; handLbl exposes
    # the known categories through its classes_ attribute.
    handLbl, labels_trn = ct.labelsToMultilabelF(
        pre_train_set_parse.getInstancesCategories())
    if v:
        print("objects with labels: " + str(labels_trn.shape) +
              " - number of categories: " + str(len(list(handLbl.classes_))))

    # Initial sample weight, and a position counter starting at 0
    # (presumably an index into `wa` -- confirm in the code that follows).
    weight = 1
    posW = 0
    # Walk the years backwards, from curr_year down to first_year.
    # reversed() is iterated directly; no intermediate list is needed.
    for year in reversed(range(first_year, curr_year + 1)):
        # Original intent: add to the training set the posts of `year`,
        # weighted by `weight` (sample * weight).
        train_setDocs = curBlog.getPostsFromAYear(year)
        if v:
            print("year:" + str(year) + "-train:" + str(len(train_setDocs)) +
                  " w:" + str(weight))