def getSample(path, langPercent):
    """Build per-problem counts restricted to one random sample of entries.

    For every ``(key, (files, unknown))`` pair yielded by
    ``dr.problems(dr.dirproblems(path))`` the texts of all ``files`` are
    concatenated and passed to ``dr.bow``.  A random subset of the first
    element of that result is drawn, and that same subset is then applied,
    through ``getSelection``, to the counts of the whole problem and to the
    counts of each individual file.

    Args:
        path: Location handed to ``dr.dirproblems``.
        langPercent: Callable that receives the problem key and returns the
            fraction of entries to sample (any value accepted by ``float``).

    Returns:
        A dict mapping each problem key to a dict that holds the
        ``getSelection`` result for the concatenated text under ``'total'``
        and the ``getSelection`` result for each file under the last
        ``/``-separated component of that file's name.

    Side effects:
        Prints the resulting dict to standard output before returning it.
    """
    problems = dr.problems(dr.dirproblems(path))
    data = {}
    for dirname, (files, _unknown) in problems:
        percent = langPercent(dirname)

        # Each entry of `files` is indexed as [0] = file name, [1] = text.
        # NOTE(review): the texts are joined with no separator, so the end of
        # one file runs straight into the start of the next -- confirm that
        # dr.bow is expected to see them that way.
        docs = "".join(doc[1] for doc in files)
        count = dr.bow(docs)

        # Number of entries to draw, as a fraction of the problem's entries.
        sample = int(round(float(len(count[0])) * float(percent)))

        # NOTE(review): random.sample needs a sequence on Python 3; confirm
        # the type of count[0] returned by dr.bow before porting.
        selection = random.sample(count[0], sample)

        # The same selection is reused for the total and for every file so
        # that all entries of this problem are comparable.
        problem_data = {'total': getSelection(count[0], selection)}
        for doc in files:
            namefile = doc[0].split("/")[-1]
            count_file = dr.bow(doc[1])
            problem_data[namefile] = getSelection(count_file[0], selection)
        data[dirname] = problem_data

    # Single-argument call form prints identically on Python 2 and 3.
    print(data)
    return data
_ignore.append(line.strip()) # Loading stopwords if exits stopwords=[] if os.path.exists(opts.stopwords): verbose('Loading stopwords: ',opts.stopwords) stopwords=docread.readstopwords(opts.stopwords) else: info('Stopwords file not found assuming, emtpy',opts.stopwords) # Loading main files ------------------------------------------------- # load problems or problem verbose('Loading files') problems=docread.problems( docread.dirproblems(dirname,known_pattern,unknown_pattern,_ignore, code=codes[opts.language][opts.genre])) # Loading answers file only for DEVELOPMENT OR TRAINNING MODE if opts.mode.startswith("train") or opts.mode.startswith("devel"): if opts.Answers: answers_file=opts.Answers else: answers_file="{0}/{1}".format(dirname,opts.answers) verbose('Loading answer file: {0}'.format(answers_file)) answers = docread.loadanswers(answers_file,_ignore, code=codes[opts.language][opts.genre]) # Checking for consistency if not len(problems) == len(answers): p.error("Not match for number of problems({0}) and \
def getFiles(path):
    """Read the known and unknown documents of every problem under *path*.

    Iterates the ``(id, (known, unknown))`` pairs produced by
    ``dr.dirproblems(path)`` and passes every listed entry to ``dr.readdoc``.

    Returns:
        A dict keyed by problem id.  Each value is a dict with the keys
        ``'known'`` and ``'unknown'``, each holding the list of
        ``dr.readdoc`` results for the corresponding entries.
    """
    problems = {}
    for problem_id, (known_paths, unknown_paths) in dr.dirproblems(path):
        # Known documents are read before unknown ones, as in the original.
        known_docs = [dr.readdoc(known) for known in known_paths]
        unknown_docs = [dr.readdoc(unknown) for unknown in unknown_paths]
        problems[problem_id] = {
            'known': known_docs,
            'unknown': unknown_docs,
        }
    return problems
# Open the file named by the output option; failures are reported through
# p.error.
# NOTE(review): the file is opened in the default read mode although the
# option is called "output" -- confirm whether mode 'w' was intended.
try:
    out = open(opts.output)
except (IOError, OSError, TypeError):
    # TypeError covers an option value that is not a path (e.g. None).
    p.error('Output parameter could not been open: {0}'
            .format(opts.output))

# Load the entries to ignore, one per line, if a .ignore file exists.
_ignore = []
if os.path.exists('.ignore'):
    verbose('Loading files to ignore frm: .ignore')
    with open('.ignore') as ignore_file:
        # A str has no readlines() method, so the previous
        # read().readlines() raised AttributeError.  Iterate the file and
        # strip the line endings so entries compare equal to plain names.
        _ignore = [line.strip() for line in ignore_file]

# Load problems or problem.
problems = docread.dirproblems(dirname, opts.known, opts.unknown, _ignore)

# TRAINING MODE
if opts.mode.startswith("train"):
    # The answers file is taken from the second positional argument.
    if len(args) != 2:
        p.error("Answers needed for train mode")
    verbose('Loading answer file: {0}'.format(args[1]))
    answers = docread.loadanswers(args[1])

    # Checking for consistency: `answers` only exists in this branch.
    if len(problems) != len(answers):
        # Adjacent literals keep source indentation out of the message.
        p.error("Not match for number of problems({0}) and "
                "answers({1})".format(len(problems), len(answers)))