Ejemplo n.º 1
0
def run():
    global metric
    global ranking
    print 'Loading RDG...',
    metric = Metric(rdgfolder, refcorpus)
    print 'done'
    print 'Ranking terms...',
    ranking = []
    ranking = metric.rankTerms(measure='Weighted')
    print 'done'
Ejemplo n.º 2
0
def __main__(args):
    """Input an RDG (and background), output a terms list.

    ``args`` follows the ``sys.argv`` layout:

    * ``args[0]`` -- program name, used only in the usage message.
    * ``args[1]`` -- RDG path; must pass ``os.path.isfile``.
    * ``args[2:]`` -- any mix of:

      - ``-m MEASURE``: ranking measure, must be in ``MEASURES``
        (default ``'Weighted'``).
      - ``-t TESTFILE``: existing file handed to ``Metric.rankWordList``.
      - ``-d DIR``: working directory; created when it does not exist.
      - ``False``: literal token, passes ``overwrite=False`` to ``Metric``.
      - anything else: reference path; must pass ``os.path.isfile``.

    Every ranked entry whose term is at most ``MAX_LEN`` characters long is
    written to stdout as ``term<TAB>score``.

    On invalid arguments (including a missing reference path, which Metric
    needs here) the reason and the usage text are logged and the process
    exits with status -1.  A closed stdout pipe (EPIPE) ends the output
    quietly; any other IOError propagates to the caller.
    """
    logging.basicConfig(level=LEVEL)
    working_dir = '.'
    overwrite = True
    measure = 'Weighted'
    # None means "not supplied on the command line".
    reffolder = None
    testfile = None

    def option_value(index):
        # Return the value that follows the option found at args[index].
        if index + 1 >= len(args):
            raise ValueError('Option ' + str(args[index]) +
                             ' requires a value')
        return args[index + 1]

    try:
        if len(args) < 2:
            raise ValueError('Missing RDG folder argument')
        # First argument: RDG folder
        rdgfolder = args[1]
        if not os.path.isfile(rdgfolder):
            raise ValueError('RDG folder not found: ' + str(rdgfolder))
        # Optional arguments:
        i = 2
        while i < len(args):
            arg = args[i]
            if arg == '-m':
                measure = option_value(i)
                if measure not in MEASURES:
                    raise ValueError('Unknown measure: ' + str(measure))
                i += 2
            elif arg == '-t':
                testfile = option_value(i)
                if not os.path.isfile(testfile):
                    raise ValueError('Test file not found: ' + str(testfile))
                i += 2
            elif arg == '-d':
                working_dir = option_value(i)
                if not os.path.isdir(working_dir):
                    if os.path.isfile(working_dir):
                        raise ValueError('Working directory is a file: ' +
                                         str(working_dir))
                    os.mkdir(working_dir)
                i += 2
            elif arg == 'False':
                overwrite = False
                # Advance past the token; without this the loop never ends.
                i += 1
            else:
                # Anything else is taken as the reference folder.
                reffolder = arg
                if not os.path.isfile(reffolder):
                    raise ValueError('Reference folder not found: ' +
                                     str(reffolder))
                i += 1
        if reffolder is None:
            # The reference-corpus fallback is disabled, so this is required.
            raise ValueError('A reference folder is required')
    except (ValueError, OSError) as err:
        # Say what was wrong, then remind the user what input is acceptable.
        logging.error(str(err))
        prog = args[0] if len(args) > 0 else ''
        logging.error('Usage: ' + prog +
                      ' rdg_folder [ref_folder] [-m measure] [-t testfile]')
        logging.error('Measures: ' + str(MEASURES))
        sys.exit(-1)

    # log parameters
    logging.info('RDG Folder: ' + rdgfolder)
    logging.info('Reference Folder: ' + reffolder)
    logging.info('Measure: ' + measure)
    if testfile is not None:
        logging.info('Test Word List File: ' + testfile)
    else:
        logging.info('Test Word List File: None')

    # Set up the measurement class
    logging.debug('Loading Files...')
    metric = Metric(rdgfolder,
                    reffolder,
                    working_dir=working_dir,
                    overwrite=overwrite)
    logging.debug('done')

    # Get rankings
    logging.debug('Ranking terms...')
    if testfile is not None:
        ranking = metric.rankWordList(testfile, measure)
    else:
        ranking = metric.rankTerms(measure)
    logging.debug('done')

    # Print rankings
    try:
        for entry in ranking:
            if len(entry[0]) > MAX_LEN:
                continue
            sys.stdout.write(entry[0] + '\t' + str(entry[1]) + '\n')
    except IOError as e:
        # EPIPE: the reader closed stdout, so stop quietly.  Anything else
        # is a real failure and must not be hidden.
        if e.errno != errno.EPIPE:
            raise
Ejemplo n.º 3
0
def __main__(args):
    """Input an RDG (and background) and other parameters, output a terms list.

    ``args`` follows the ``sys.argv`` layout:

    * ``args[0]`` -- program name, used only in the usage message.
    * ``args[1]`` -- mode; the literal ``"RankFromPrevious"`` turns on the
      global ``rank_from_previous`` flag, any other value turns it off.
    * ``args[2]`` -- RDG path; must pass ``os.path.isfile``.
    * ``args[3]`` -- output file the ranking is written to.
    * ``args[4]`` -- background cache file; ``'false'`` (any case) selects
      ``'ranking.pkl'``.  Must already exist when ``rank_from_previous`` is
      on.  Stored in the global ``background_cache_file``.
    * ``args[5:]`` -- any mix of:

      - ``-m MEASURE``: ranking measure, must be in ``MEASURES``
        (default ``'Weighted'``).
      - ``-t TESTFILE``: existing file handed to ``Metric.rankWordList``.
      - ``-d DIR``: working directory; created when it does not exist.
      - ``False``: literal token, passes ``overwrite=False`` to ``Metric``.
      - anything else: reference path; must pass ``os.path.isfile`` unless
        ``rank_from_previous`` is on.

    Every ranked entry whose term is at most ``MAX_LEN`` characters long is
    written as ``term<TAB>score`` to both stdout and the output file.

    On invalid arguments the reason and the usage text are logged and the
    process exits with status -1.  When no reference path was given an
    error is logged and the function returns without ranking.  A closed
    pipe (EPIPE) ends the output quietly; every other exception raised
    while loading, ranking or writing propagates unchanged.
    """
    global rank_from_previous
    global background_cache_file
    logging.basicConfig(level=LEVEL)
    working_dir = '.'
    overwrite = True
    measure = 'Weighted'
    # None means "not supplied on the command line".
    reffolder = None
    testfile = None

    def option_value(index):
        # Return the value that follows the option found at args[index].
        if index + 1 >= len(args):
            raise ValueError('Option ' + str(args[index]) +
                             ' requires a value')
        return args[index + 1]

    try:
        if len(args) < 5:
            raise ValueError('Missing required arguments')
        # Set the flag both ways so a value left over from an earlier call
        # cannot leak into this one.
        rank_from_previous = (args[1] == "RankFromPrevious")
        rdgfolder = args[2]
        if not os.path.isfile(rdgfolder):
            raise ValueError('RDG folder not found: ' + str(rdgfolder))
        outfile = args[3]
        background_cache_file = args[4]
        if background_cache_file.lower() == 'false':
            background_cache_file = 'ranking.pkl'
        if rank_from_previous and not os.path.isfile(background_cache_file):
            exception_string = background_cache_file + ' does not exist.\n'
            exception_string += 'Please rerun the system. If you choose the "rank from previous" option, \n'
            exception_string += 'you must choose an existing cached background file. When you rerun, '
            exception_string += 'you may not need to preprocess the foreground on the next run.'
            print(exception_string)
            raise ValueError('Background cache file not found: ' +
                             background_cache_file)
        # Optional arguments:
        i = 5
        while i < len(args):
            arg = args[i]
            if arg == '-m':
                measure = option_value(i)
                if measure not in MEASURES:
                    raise ValueError('Unknown measure: ' + str(measure))
                i += 2
            elif arg == '-t':
                testfile = option_value(i)
                if not os.path.isfile(testfile):
                    raise ValueError('Test file not found: ' + str(testfile))
                i += 2
            elif arg == '-d':
                working_dir = option_value(i)
                if not os.path.isdir(working_dir):
                    if os.path.isfile(working_dir):
                        raise ValueError('Working directory is a file: ' +
                                         str(working_dir))
                    os.mkdir(working_dir)
                i += 2
            elif arg == 'False':
                overwrite = False
                # Advance past the token; without this the loop never ends.
                i += 1
            else:
                # Anything else is taken as the reference folder.  It only
                # has to exist when it is actually going to be read.
                reffolder = arg
                if not (os.path.isfile(reffolder) or rank_from_previous):
                    raise ValueError('Reference folder not found: ' +
                                     str(reffolder))
                i += 1
    except (ValueError, OSError) as err:
        # Say what was wrong, then remind the user what input is acceptable.
        logging.error(str(err))
        prog = args[0] if len(args) > 0 else ''
        logging.error('Usage: ' + prog +
                      ' rdg_folder [ref_folder] [-m measure] [-t testfile]')
        logging.error('Measures: ' + str(MEASURES))
        sys.exit(-1)

    # log parameters
    logging.info('RDG Folder: ' + rdgfolder)
    if reffolder is not None and not rank_from_previous:
        logging.info('Reference Folder: ' + reffolder)
    else:
        logging.info('Reference Folder: None')
    logging.info('Measure: ' + measure)
    if testfile is not None:
        logging.info('Test Word List File: ' + testfile)
    else:
        logging.info('Test Word List File: None')

    # Set up the measurement class
    logging.debug('Loading Files...')
    if reffolder is None:
        logging.error("Ref Folder Not In Locals, or exception in Metric")
        return
    metric = Metric(rdgfolder,
                    reffolder,
                    working_dir=working_dir,
                    overwrite=overwrite,
                    rank_from_previous=rank_from_previous,
                    background_cache_file=background_cache_file)

    # Get rankings
    logging.debug('Ranking terms...')
    if testfile is not None:
        ranking = metric.rankWordList(testfile, measure)
    elif rank_from_previous:
        ranking = metric.rankTermsFromPrevious(measure)
    else:
        ranking = metric.rankTerms(measure)

    # Write rankings to stdout and to the output file
    try:
        with open(outfile, 'w') as outstream:
            for entry in ranking:
                if len(entry[0]) > MAX_LEN:
                    continue
                line = entry[0] + '\t' + str(entry[1]) + '\n'
                sys.stdout.write(line)
                outstream.write(line)
    except IOError as e:
        # EPIPE: the reader closed stdout, so stop quietly.  Anything else
        # (for example an unwritable outfile) is a real failure and must
        # not be hidden.
        if e.errno != errno.EPIPE:
            raise
Ejemplo n.º 4
0
def __main__(args):
    """Input an RDG (and background), output a terms list.

    ``args`` follows the ``sys.argv`` layout:

    * ``args[0]`` -- program name, used only in the usage message.
    * ``args[1]`` -- RDG path; must pass ``os.path.isfile``.
    * ``args[2:]`` -- any mix of:

      - ``-m MEASURE``: ranking measure, must be in ``MEASURES``
        (default ``'Weighted'``).
      - ``-t TESTFILE``: existing file handed to ``Metric.rankWordList``.
      - ``-d DIR``: working directory; created when it does not exist.
      - anything else: reference path; must pass ``os.path.isfile``.

    Every ranked entry whose term is at most ``MAX_LEN`` characters long is
    written to stdout as ``term<TAB>score``.

    On invalid arguments (including a missing reference path, which Metric
    needs here) the reason and the usage text are logged and the process
    exits with status -1.  A closed stdout pipe (EPIPE) ends the output
    quietly; any other IOError propagates to the caller.
    """
    logging.basicConfig(level=LEVEL)
    working_dir = '.'
    measure = 'Weighted'
    # None means "not supplied on the command line".
    reffolder = None
    testfile = None

    def option_value(index):
        # Return the value that follows the option found at args[index].
        if index + 1 >= len(args):
            raise ValueError('Option ' + str(args[index]) +
                             ' requires a value')
        return args[index + 1]

    try:
        if len(args) < 2:
            raise ValueError('Missing RDG folder argument')
        # First argument: RDG folder
        rdgfolder = args[1]
        if not os.path.isfile(rdgfolder):
            raise ValueError('RDG folder not found: ' + str(rdgfolder))
        # Optional arguments:
        i = 2
        while i < len(args):
            arg = args[i]
            if arg == '-m':
                measure = option_value(i)
                if measure not in MEASURES:
                    raise ValueError('Unknown measure: ' + str(measure))
                i += 2
            elif arg == '-t':
                testfile = option_value(i)
                if not os.path.isfile(testfile):
                    raise ValueError('Test file not found: ' + str(testfile))
                i += 2
            elif arg == '-d':
                working_dir = option_value(i)
                if not os.path.isdir(working_dir):
                    if os.path.isfile(working_dir):
                        raise ValueError('Working directory is a file: ' +
                                         str(working_dir))
                    os.mkdir(working_dir)
                i += 2
            else:
                # Anything else is taken as the reference folder.
                reffolder = arg
                if not os.path.isfile(reffolder):
                    raise ValueError('Reference folder not found: ' +
                                     str(reffolder))
                i += 1
        if reffolder is None:
            # The reference-corpus fallback is disabled, so this is required.
            raise ValueError('A reference folder is required')
    except (ValueError, OSError) as err:
        # Say what was wrong, then remind the user what input is acceptable.
        logging.error(str(err))
        prog = args[0] if len(args) > 0 else ''
        logging.error('Usage: '+prog+' rdg_folder [ref_folder] [-m measure] [-t testfile]')
        logging.error('Measures: '+str(MEASURES))
        sys.exit(-1)

    # log parameters
    logging.info('RDG Folder: '+rdgfolder)
    logging.info('Reference Folder: '+reffolder)
    logging.info('Measure: ' + measure)
    if testfile is not None:
        logging.info('Test Word List File: ' + testfile)
    else:
        logging.info('Test Word List File: None')

    # Set up the measurement class
    logging.debug('Loading Files...')
    metric = Metric(rdgfolder, reffolder, working_dir)
    logging.debug('done')

    # Get rankings
    logging.debug('Ranking terms...')
    if testfile is not None:
        ranking = metric.rankWordList(testfile, measure)
    else:
        ranking = metric.rankTerms(measure)
    logging.debug('done')

    # Print rankings
    try:
        for entry in ranking:
            if len(entry[0]) > MAX_LEN:
                continue
            sys.stdout.write(entry[0]+'\t'+str(entry[1])+'\n')
    except IOError as e:
        # EPIPE: the reader closed stdout, so stop quietly.  Anything else
        # is a real failure and must not be hidden.
        if e.errno != errno.EPIPE:
            raise
Ejemplo n.º 5
0
os.chdir(oldpath)
print 'Getting stemming dictionary'
Filter._get_stemdict(os.path.join(genpath, 'filter.save'))
print 'Getting background'
metric.genDocs = Document()
genfiles = [genFile for genFile in os.listdir(genpath) if genFile[-4:]=='.txt']
for f in genfiles:
    d = Document(os.path.join(genpath, f))
    for w in d.counts:
        metric.genDocs.counts[w] += d.counts[w]
print 'Getting all RDG subfolders'
rdgdirs = [d for d in os.listdir(rdgpath) if os.path.isdir(os.path.join(rdgpath, d))]
for d in rdgdirs:
    print 'Computing metrics for '+d
    if os.path.exists(os.path.join(rdgpath, d, 'TFIDF.out')):
        print "Skip"
        continue
    metric.rdgDocs = []
    metric._TermFreq, metric._DR, metric._DC, metric._TFIDF = [None]*4
    del metric._TermFreq, metric._DR, metric._DC, metric._TFIDF
    rdgfiles = [rdgFile for rdgFile in os.listdir(os.path.join(rdgpath,d)) if rdgFile[-4:]=='.txt']
    for f in rdgfiles:
        metric.rdgDocs.append(Document(os.path.join(rdgpath,d,f)))
    for m in measures:
        ranking = metric.rankTerms(m)
        f = open(os.path.join(rdgpath,d,m+'.out'), 'w')
        for i in range(len(ranking)):
            f.write(ranking[i][0]+'\t'+str(ranking[i][1])+'\n')
        f.close()
print 'Done'