Ejemplo n.º 1
0
    def parseDistribution(self, instance, start, end=None):
        """Parse a class distribution out of a tokenised output line.

        The tokens strictly between ``instance[start]`` and ``instance[end]``
        are read as alternating ``label score`` pairs; a score token may carry
        a trailing comma, which is stripped before conversion.

        Args:
            instance: Sequence of string tokens making up one line.
            start: Index of the token preceding the first label; parsing
                begins at ``start + 1``.
            end: Index at which parsing stops (exclusive). When omitted or
                otherwise falsy, the index of the last token is used.

        Returns:
            ``Distribution`` built from the collected ``{label: score}``
            mapping. Unparsable scores and an empty result are reported on
            stderr instead of being raised.
        """
        dist = {}
        i = start + 1

        if not end:
            end = len(instance) - 1

        while i < end:
            label = instance[i]
            # Fetched outside the ``try`` so the error message can reuse it
            # and only the float conversion itself is guarded.
            raw_score = instance[i + 1].rstrip(",")
            try:
                score = float(raw_score)
            except ValueError:
                # Only a malformed number is recoverable here; anything else
                # (including KeyboardInterrupt) must propagate.
                print(
                    "ERROR: pynlpl.input.timbl.TimblOutput -- Could not fetch score for class '"
                    + label + "', expected float, but found '" +
                    raw_score + "'. Instance= " +
                    " ".join(instance) + ".. Attempting to compensate...",
                    file=stderr)
                # Advance by a single token so the token that failed to parse
                # as a score is retried as the next label.
                i += 1
            else:
                dist[label] = score
                i += 2

        if not dist:
            print(
                "ERROR: pynlpl.input.timbl.TimblOutput --  Did not find class distribution for ",
                instance,
                file=stderr)

        return Distribution(dist)
Ejemplo n.º 2
0
    def append(self, word_id, senses, distance=0):
        """Store the senses predicted for a word, plus its distance.

        Args:
            word_id: Key under which the senses are stored. An existing entry
                for the same id is overwritten.
            senses: Either a ``Distribution`` or a non-empty list. When the
                first list item has length 1 the items are treated as bare
                sense ids and each gets the same confidence
                ``1 / len(senses)``; otherwise the items are unpacked as
                ``(sense_id, confidence)`` pairs.
            distance: Value recorded in ``self.distances`` for this word;
                ``self.maxDistance`` is raised to it when it is larger.

        Raises:
            AssertionError: If ``senses`` is neither a ``Distribution`` nor a
                non-empty list.
        """
        # Duplicate ids are tolerated on purpose: some ids are repeated in the
        # SoNaR test files, so no uniqueness check is made here.
        if isinstance(senses, Distribution):
            # Materialise the pairs as a list. A generator expression would be
            # exhausted after its first iteration, leaving the stored entry
            # empty on every later read.
            # NOTE(review): the original comment calls this re-packing a patch
            # around Distribution -- confirm whether storing `senses` itself
            # would be preferable.
            entry = [(sense, score) for sense, score in senses]
        else:
            assert isinstance(senses, list) and len(senses) >= 1

            if len(senses[0]) == 1:
                # Not (sense_id, confidence) tuples: spread the confidence
                # equally over all elements.
                confidence = 1 / float(len(senses))
                entry = [(sense, confidence) for sense in senses]
            elif all(confidence is not None for _sense, confidence in senses):
                # Every sense carries a confidence: a full distribution.
                entry = Distribution(senses)
            else:
                # At least one confidence is missing; keep the list as given.
                entry = senses

        # State is only written once the input has been fully validated, so a
        # failure above leaves data/distances/maxDistance consistent.
        self.data[word_id] = entry
        self.distances[word_id] = distance
        if distance > self.maxDistance:
            self.maxDistance = distance
Ejemplo n.º 3
0
def main():
    """Print a frequency list for the files named on the command line.

    Options (parsed from ``sys.argv``):
        -h, --help  show usage and exit
        -n N        count n-grams of size N instead of single tokens
        -i          build the frequency list case-insensitively
        -e ENC      encoding used to read the input files (default utf-8)

    For every type one tab-separated line is written to stdout: the type,
    its count, its value in the distribution and its information value.
    Token/type totals, the type-token ratio and the entropy go to stderr.

    Exits with status 2 on an option-parsing error and with status 1 on an
    unhandled option or when no input files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # Report the parsing problem, show the help text and bail out.
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o in ("-h", "--help"):
            # Declared in the getopt spec, so it must not be reported as an
            # unknown option.
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # The Python 2 "print >> stream" form raises TypeError when print is
        # a function; use the file= keyword like the rest of this function.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # The context manager closes the file even if tokenising fails.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for item, count in freqlist:
        # n-grams arrive as sequences; flatten them for display and lookup.
        if isinstance(item, (tuple, list)):
            item = " ".join(item)
        s = item + "\t" + str(count) + "\t" + str(dist[item]) + "\t" + str(
            dist.information(item))
        print(s)

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Ejemplo n.º 4
0
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

# Without any input files there is nothing to count.
if not files:
    # The Python 2 "print >> stream" form raises TypeError when print is a
    # function; use the file= keyword like the other messages in this script.
    print("No files specified", file=sys.stderr)
    sys.exit(1)

# Accumulate token (or n-gram, when n > 1) counts over all input files.
freqlist = FrequencyList(None, casesensitive)
for filename in files:
    # The context manager closes the file even if tokenising fails.
    with codecs.open(filename, 'r', encoding) as f:
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))

# One tab-separated line per type on stdout: type, count, distribution value
# and information value.
dist = Distribution(freqlist)
for item, count in freqlist:
    # n-grams arrive as sequences; flatten them for display and lookup.
    # (Named `item` so the builtin `type` is not shadowed at module level.)
    if isinstance(item, (tuple, list)):
        item = " ".join(item)
    s = item + "\t" + str(count) + "\t" + str(dist[item]) + "\t" + str(
        dist.information(item))
    print(s)

# Summary statistics go to stderr so stdout stays machine-readable.
print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
print("Entropy:          ", dist.entropy(), file=sys.stderr)