Example #1
def checkArgs(self):
    if not CmdOptions.checkArgs(self):
        return False
    # Multiple positive bags only make sense when positives are sampled at random.
    if self.getString('select_pos') != 'random' and self.getInt('nr_pos_bags') > 1:
        printError(self.__class__.__name__, "unless select_pos is random, nr_pos_bags shall be 1")
        return False
    return True
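
The snippet overrides a checkArgs hook inherited from a CmdOptions base class that is not shown on this page. Below is a minimal, self-contained sketch of that pattern; the base class, the option store, and the printError helper are assumptions for illustration, not the project's actual implementation.

import sys


def printError(prefix, msg):
    # Hypothetical stand-in for the project's printError helper.
    sys.stderr.write('%s ERROR %s\n' % (prefix, msg))


class CmdOptions(object):
    # Hypothetical base class: holds parsed options and validates them.
    def __init__(self, opts):
        self.opts = opts

    def getString(self, key):
        return str(self.opts[key])

    def getInt(self, key):
        return int(self.opts[key])

    def checkArgs(self):
        return True  # the base class accepts everything


class BagOptions(CmdOptions):
    def checkArgs(self):
        if not CmdOptions.checkArgs(self):
            return False
        if self.getString('select_pos') != 'random' and self.getInt('nr_pos_bags') > 1:
            printError(self.__class__.__name__,
                       "unless select_pos is random, nr_pos_bags shall be 1")
            return False
        return True


opts = BagOptions({'select_pos': 'uniform', 'nr_pos_bags': 3})
print(opts.checkArgs())  # False: multiple positive bags need random selection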
Example #2
def precompute_annotator(self, concept):
    INFO = 'dataengine.%s.precompute_annotator' % self.__class__.__name__
    topn = 100
    NegativeEngine.precompute_annotator(self, concept)

    for subconcept in concept.split('-'):
        # Start from the sub-concept itself plus its WordNet expansion.
        expandedTagSet = set([subconcept] + wn_expand(subconcept))
        try:
            datafile = os.path.join(ROOT_PATH, self.collection, 'SimilarityIndex', 'ngd', '%s.txt' % subconcept)
            rankedtags = readRankingResults(datafile)
            # Also absorb the top-ranked tags from the NGD similarity index.
            expandedTagSet = expandedTagSet.union(set(x[0] for x in rankedtags[:topn]))
        except Exception:
            printError(INFO, 'failed to load ranktag file for %s' % subconcept)
        self.annotator = self.annotator.union(expandedTagSet)
    printStatus(INFO, 'precomputing the virtual annotator for %s: %d tags' % (concept, len(self.annotator)))
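
readRankingResults is not shown on this page; from the usage above it must return a ranked list whose first element per entry is the tag name. Below is a minimal parser sketch under that assumption; the one-pair-per-line "tag score" file format is a guess, not confirmed by the source.

def readRankingResults(filename):
    # Assumed format: one "tag score" pair per line, best-ranked first.
    results = []
    with open(filename) as fr:
        for line in fr:
            tag, score = line.split()
            results.append((tag, float(score)))
    return results


# Usage matching the snippet: keep only the top-100 tag names.
# rankedtags = readRankingResults(datafile)
# top_tags = set(x[0] for x in rankedtags[:100])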
Ejemplo n.º 4
0
    def precompute_annotator(self, concept):
        INFO = 'dataengine.%s.precompute_annotator' % self.__class__.__name__
        topn = 100
        NegativeEngine.precompute_annotator(self, concept)

        for subconcept in concept.split('-'):
            expandedTagSet = set([subconcept] + wn_expand(subconcept))
            try:
                datafile = os.path.join(ROOT_PATH, self.collection,
                                        'SimilarityIndex', 'ngd',
                                        '%s.txt' % subconcept)
                rankedtags = readRankingResults(datafile)
                expandedTagSet = expandedTagSet.union(
                    set([x[0] for x in rankedtags[:topn]]))
            except:
                printError(INFO,
                           'failed to load ranktag file for %s' % subconcept)
            self.annotator = self.annotator.union(expandedTagSet)
        printStatus(
            INFO, 'precomputing the virtual annotator for %s: %d tags' %
            (concept, len(self.annotator)))
Example #3
def process(options, testCollection, trainCollection, annotationName,
            tagrelMethod, tagfeature):
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    nr_of_concepts = len(concepts)
    mapping = dict(zip(concepts, range(nr_of_concepts)))

    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                            tagfeature)
    binary_file = os.path.join(feat_dir, 'feature.bin')
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')

    if checkToSkip(binary_file, overwrite):
        sys.exit(0)

    inputfile = os.path.join(rootpath, testCollection, 'autotagging',
                             testCollection, trainCollection, tagrelMethod,
                             'id.tagvotes.txt')
    if not os.path.exists(inputfile):
        # INFO is defined at module level in the source file.
        printError(INFO, '%s does not exist' % inputfile)
        sys.exit(0)

    makedirsforfile(binary_file)
    fw = open(binary_file, 'wb')
    processed = set()
    imset = []
    count_line = 0

    with open(inputfile) as fr:
        for line in fr:
            count_line += 1
            elems = line.strip().split()
            name = elems.pop(0)

            # Keep only the first occurrence of each image id.
            if name in processed:
                continue
            processed.add(name)

            assert len(elems) == 2 * nr_of_concepts
            vec = [0] * nr_of_concepts

            # elems is a flat (tag, score) sequence; map each score onto the
            # concept order of the training annotation.
            for i in range(0, len(elems), 2):
                tag = elems[i]
                idx = mapping[tag]
                score = float(elems[i + 1])
                vec[idx] = score

            s = float(sum(vec))  # l1-normalize the tag-vote vector
            vec = np.array([x / s for x in vec], dtype=np.float32)
            vec.tofile(fw)
            imset.append(name)

    fw.close()

    fw = open(id_file, 'w')
    fw.write(' '.join(imset))
    fw.close()

    fw = open(shape_file, 'w')
    fw.write('%d %d' % (len(imset), nr_of_concepts))
    fw.close()
    print('%d lines parsed, %d ids -> %d unique ids' %
          (count_line, len(processed), len(imset)))
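
The function emits a flat row-major float32 matrix (feature.bin) plus two sidecar files (id.txt, shape.txt). Below is a minimal sketch for loading that layout back with numpy; the directory path is a placeholder, but the layout itself follows from the writer above.

import os

import numpy as np

feat_dir = 'path/to/FeatureData/tagfeature'  # placeholder path

# shape.txt holds "nr_images nr_concepts"; id.txt holds the image ids.
with open(os.path.join(feat_dir, 'shape.txt')) as fr:
    nr_images, nr_concepts = map(int, fr.read().split())
with open(os.path.join(feat_dir, 'id.txt')) as fr:
    imset = fr.read().split()

# feature.bin is the row-major float32 matrix written by vec.tofile(fw).
feats = np.fromfile(os.path.join(feat_dir, 'feature.bin'),
                    dtype=np.float32).reshape(nr_images, nr_concepts)
assert len(imset) == nr_images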