def process(filepath):
    """Count word, lemma and (lemma, pos-tag) frequencies in one FoLiA document.

    Returns a tuple (filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos)
    so the caller can merge results per file.  Per-word annotation errors are
    reported on stderr and that word is skipped.

    NOTE(review): uses Python-2 print statements — this file presumably targets
    Python 2; confirm before porting.
    """
    # Log a timestamped progress line for this file
    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >> sys.stderr, s
    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        # Stream words with folia.Reader rather than loading the whole document
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >> sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                    # (lemma, pos) pairs are only counted when both are present
                    if word.pos():
                        freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >> sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        # Best effort: report the failure and return whatever was counted so far
        print >> sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)
    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos
def train(self, sourcefile, targetfile):
    """Train word co-occurrence tables from two line-aligned parallel corpora.

    :param sourcefile: path to the source-language text (one sentence per line)
    :param targetfile: path to the target-language text, line-aligned with it

    Builds self.sourcefreqlist / self.targetfreqlist and, for every token, a
    FrequencyList of the tokens it co-occurs with on the opposite side
    (self.source2target and self.target2source).
    """
    self.sourcefreqlist = FrequencyList(None, self.casesensitive)
    self.targetfreqlist = FrequencyList(None, self.casesensitive)  #frequency lists
    self.source2target = {}
    self.target2source = {}
    # 'with' guarantees both files are closed even if an error occurs mid-way
    # (the old code leaked the handles on any exception)
    with open(sourcefile) as fsource, open(targetfile) as ftarget:
        # NOTE: zip stops at the shorter file; assumes both are equally long
        for sourceline, targetline in zip(fsource, ftarget):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()
            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)
            for sourcetoken in sourcetokens:
                if sourcetoken not in self.source2target:
                    self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)
            for targettoken in targettokens:
                if targettoken not in self.target2source:
                    self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)
def load(self, filename):
    """Load a SimpleLanguageModel from *filename* (as produced by save()).

    Format: a '[simplelanguagemodel]' header, key=value settings, then
    '[freqlistN]' and '[freqlistNm1]' sections of tab-separated
    "tokens<TAB>count" lines.

    :raises Exception: if the file lacks the header or has a syntax error.
    Unparsable frequency-list lines are warned about and skipped.
    """
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    # 'with' ensures the handle is closed (the old code never closed it);
    # iterating the file directly avoids readlines() materializing everything
    with io.open(filename, 'r', encoding='utf-8') as f:
        # state machine: False = expect header, 1 = settings,
        # 2 = reading [freqlistN], 3 = reading [freqlistNm1]
        mode = False
        for line in f:
            line = line.strip()
            if not line:
                continue
            if not mode:
                if line != "[simplelanguagemodel]":
                    raise Exception("File is not a SimpleLanguageModel")
                else:
                    mode = 1
            elif mode == 1:
                if line[:2] == 'n=':
                    self.n = int(line[2:])
                elif line[:12] == 'beginmarker=':
                    self.beginmarker = line[12:]
                elif line[:10] == 'endmarker=':
                    self.endmarker = line[10:]
                elif line[:10] == 'sentences=':
                    self.sentences = int(line[10:])
                elif line[:14] == 'casesensitive=':
                    self.casesensitive = bool(int(line[14:]))
                    # case sensitivity affects how entries hash, so rebuild
                    self.freqlistN = FrequencyList(None, self.casesensitive)
                    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
                elif line == "[freqlistN]":
                    mode = 2
                else:
                    raise Exception("Syntax error in language model file: ", line)
            elif mode == 2:
                if line == "[freqlistNm1]":
                    mode = 3
                else:
                    try:
                        # ValueError covers both a bad split and a bad int()
                        # (the old bare except hid unrelated errors too)
                        ngram, count = line.split("\t")
                        self.freqlistN.count(ngram.split(' '), int(count))
                    except ValueError:
                        # NOTE(review): 'stderr' must be imported from sys at
                        # module level — confirm
                        print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
            elif mode == 3:
                try:
                    ngram, count = line.split("\t")
                    self.freqlistNm1.count(ngram.split(' '), int(count))
                except ValueError:
                    print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
    # Pre-built (n-1)-gram padding used at sentence boundaries
    if self.beginmarker:
        self._begingram = [self.beginmarker] * (self.n - 1)
    if self.endmarker:
        self._endgram = [self.endmarker] * (self.n - 1)
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for sent in sentences:
        freqlist.append(Windower(sent, 2))
    # the corpus is expected to contain "is a" twice and "this is" once
    has_is_a = freqlist[('is', 'a')] == 2
    has_this_is = freqlist[('this', 'is')] == 1
    self.assertTrue(has_is_a and has_this_is)
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    freqlist = FrequencyList()
    for sent in sentences:
        freqlist.append(sent)
    # len() of a FrequencyList is the number of distinct types
    self.assertEqual(len(freqlist), 9)
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    freqlist = FrequencyList()
    for sent in sentences:
        freqlist.append(sent)
    # .total is the number of tokens counted, duplicates included
    self.assertEqual(freqlist.total, 13)
def process(filename):
    """Build an n-gram frequency list for one FoLiA document.

    Honours settings.n, settings.casesensitive, settings.sentencemarkers,
    settings.autooutput and settings.extension.  Returns the FrequencyList;
    when autooutput is enabled it is also saved next to the input file with
    a '.freqlist' extension.
    """
    # was a Python-2 print statement, which misbehaves under Python 3
    print("Processing " + filename, file=sys.stderr)
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()
    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            # lowercase only when case does NOT matter (the old code had
            # this condition inverted)
            if not settings.casesensitive:
                text = text.lower()
            freqlist.count(text)
    else:
        # the old else-branch referenced undefined names ('sentence',
        # 'ngram'); both branches need the sentence loop
        for sentence in doc.sentences():
            if settings.sentencemarkers:
                window = Windower(sentence.words(), settings.n)
            else:
                # None, None suppresses the begin/end markers
                window = Windower(sentence.words(), settings.n, None, None)
            for ngram in window:
                # an ngram is a sequence of words; join their token texts
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            # the old code did 'outfilename +=' on an unbound name here
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)
    return freqlist
def test_freqlist_caseinsens(self):
    """Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for sent in sentences:
        freqlist.append(sent)
    # case-insensitive counting folds 'This'/'this' together but the
    # lookup key 'Test' is still matched case-insensitively
    expected = (freqlist['sentence'] == 2
                and freqlist['this'] == 2
                and freqlist['Test'] == 1)
    self.assertTrue(expected)
def processdir(d, freqlist=None):
    """Accumulate frequency counts from all matching files under directory *d*.

    :param d: directory to scan
    :param freqlist: FrequencyList to accumulate into (a new one is created
                     when omitted)
    :return: the accumulated FrequencyList

    Recurses into subdirectories when settings.recurse is set.
    """
    # Test identity against None: the old 'if not freqlist' also triggered on
    # an EMPTY FrequencyList passed by the caller, silently discarding the
    # caller's accumulator and losing all counts.
    if freqlist is None:
        freqlist = FrequencyList()
    print("Searching in " + d, file=sys.stderr)
    for f in glob.glob(os.path.join(d, '*')):
        if f[-len(settings.extension) - 1:] == '.' + settings.extension:
            freqlist += process(f)
        elif settings.recurse and os.path.isdir(f):
            processdir(f, freqlist)
    return freqlist
def buildfromfolia(self, files, encoding='utf-8'):
    """Build the model from one or more FoLiA documents.

    :param files: a filename or a list of filenames
    :param encoding: kept for interface compatibility (FoLiA handles its own)
    """
    freqlist = FrequencyList()
    # accept a single filename as well as a list
    if isinstance(files, str):
        files = [files]
    for filename in files:
        doc = folia.Document(file=filename)
        for sentence in doc.sentences():
            freqlist.append(sentence.toktext().split(' '))
    self.buildfromfreqlist(freqlist)
def buildfromtext(self, files, encoding='utf-8'):
    """Build the model from one or more plain-text files (one sentence per line).

    :param files: a filename or a list of filenames
    :param encoding: text encoding used to read the files
    """
    freqlist = FrequencyList()
    # accept a single filename as well as a list
    if isinstance(files, str):
        files = [files]
    for filename in files:
        with open(filename, 'r', encoding=encoding) as fh:
            for line in fh:
                freqlist.append(line.strip().split())
    self.buildfromfreqlist(freqlist)
def __init__(self, n=2, casesensitive=True, beginmarker="<begin>", endmarker="<end>"):
    """A simple n-gram model backed by two frequency lists (order n and n-1).

    :param n: n-gram order, must be an int >= 2
    :param casesensitive: whether counting distinguishes case
    :param beginmarker: token used to pad sentence starts ('' / None disables)
    :param endmarker: token used to pad sentence ends ('' / None disables)
    """
    self.casesensitive = casesensitive
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    assert isinstance(n, int) and n >= 2
    self.n = n
    self.beginmarker = beginmarker
    self.endmarker = endmarker
    self.sentences = 0
    # pre-build the (n-1)-gram of pure markers used to pad sentence edges
    if self.beginmarker:
        self._begingram = (self.beginmarker,) * (n - 1)
    if self.endmarker:
        self._endgram = (self.endmarker,) * (n - 1)
def main():
    """Command-line entry point: build frequency lists for files/directories.

    Options: -o FILE (output), -O (auto-output next to input), -e/-E
    (encoding/extension), -s (sentence markers), -r (recurse), -q (ignore
    errors), -h/--help.
    """
    try:
        # 'e:' was missing from the option string even though -e is
        # handled below, making that option impossible to pass
        opts, args = getopt.getopt(sys.argv[1:], "o:Oe:E:htspwrq", ["help"])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    outputfile = None
    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        elif o == '-e':
            settings.encoding = a
        elif o == '-E':
            settings.extension = a
        elif o == '-o':
            outputfile = a
        elif o == '-O':
            settings.autooutput = True
        elif o == '-s':
            settings.sentencemarkers = True
        elif o == '-r':
            settings.recurse = True
        elif o == '-q':
            settings.ignoreerrors = True
        else:
            raise Exception("No such option: " + o)
    if outputfile:
        outputfile = io.open(outputfile, 'w', encoding=settings.encoding)
    # Use the positional arguments left after option parsing; the old code
    # re-read sys.argv[1:], so option strings like '-o' were treated as
    # filenames and triggered a spurious "not found" exit.
    if args:
        freqlist = FrequencyList()
        for x in args:
            if os.path.isdir(x):
                processdir(x, freqlist)
            elif os.path.isfile(x):
                freqlist += process(x)
            else:
                print("ERROR: File or directory not found: " + x, file=sys.stderr)
                sys.exit(3)
        if outputfile:
            freqlist.save(outputfile, True)
            outputfile.close()  # the old code never closed the handle
        else:
            for line in freqlist.output("\t", True):
                print(line)
    else:
        print("ERROR: No files specified", file=sys.stderr)
        sys.exit(2)
def main():
    """Command-line entry point: print a per-type frequency/information table.

    Options: -n N (n-gram order, default 1), -i (case insensitive),
    -e ENC (file encoding).  Summary statistics go to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-h" or o == "--help":
            # -h/--help were in the option string but previously fell into
            # the "Unknown option" error branch
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)
    if not files:
        # was a Python-2 print statement, which raises TypeError on Python 3
        print("No files specified", file=sys.stderr)
        sys.exit(1)
    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # 'with' closes the file even if tokenisation raises
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))
    dist = Distribution(freqlist)
    # wordtype instead of 'type': don't shadow the builtin
    for wordtype, count in freqlist:
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        s = wordtype + "\t" + str(count) + "\t" + str(dist[wordtype]) + "\t" + str(
            dist.information(wordtype))
        print(s)
    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def process(filename):
    """Build an n-gram frequency list for one FoLiA document.

    Honours settings.n, settings.casesensitive, settings.sentencemarkers,
    settings.autooutput and settings.extension.  When settings.ignoreerrors
    is set, any error is reported and a (possibly empty) list is still
    returned; otherwise the exception propagates.
    """
    # Created before the try-block so the final return can never hit an
    # unbound name when an early error is ignored (old code could raise
    # NameError on 'return freqlist')
    freqlist = FrequencyList()
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        if settings.n == 1:
            for word in doc.words():
                text = word.toktext()
                # lowercase only when case does NOT matter (the old code had
                # this condition inverted)
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
        else:
            # the old else-branch referenced undefined names ('sentence',
            # 'ngram'); both branches need the sentence loop
            for sentence in doc.sentences():
                if settings.sentencemarkers:
                    window = Windower(sentence.words(), settings.n)
                else:
                    # None, None suppresses the begin/end markers
                    window = Windower(sentence.words(), settings.n, None, None)
                for ngram in window:
                    # an ngram is a sequence of words; join their token texts
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        if settings.autooutput:
            if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
                outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
            else:
                # the old code did 'outfilename +=' on an unbound name here
                outfilename = filename + '.freqlist'
            freqlist.save(outfilename, True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename, e, file=sys.stderr)
        else:
            raise
    return freqlist
for o, a in opts: if o == "-n": n = int(a) elif o == "-i": casesensitive = False elif o == "-e": encoding = a else: print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
def process(data):
    """Curate one SONAR FoLiA file and count word/lemma/pos frequencies in it.

    *data* is a (filepath, args, kwargs) tuple as dispatched by the corpus
    processor.  The curation step rewrites annotation declarations, inserts a
    stylesheet directive and (when needed) a gap-annotation declaration, then
    writes the result atomically to the mirrored path under outputdir.  The
    count step always runs, even when curation was skipped.

    Returns (filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos).

    NOTE(review): uses Python-2 print statements throughout — this script
    presumably targets Python 2; confirm before porting.
    """
    global foliadir, indexlength, inputdir, outputdir
    curate = True
    filepath, args, kwargs = data
    # mirror the input path into the output tree
    outputfile = filepath.replace(inputdir, outputdir)
    if os.path.exists(outputfile):
        # idempotence: never re-curate a file that was already produced
        print >>sys.stderr, "Skipping curation of " + filepath + " (output file already exists)"
        curate = False
    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >>sys.stderr, s
    if curate:
        #CURATION STEP
        # old (tadpole / sonar-ner) annotation declarations -> new (frog / NERD)
        # declarations with handle-based set identifiers
        replace = [
            ('<pos-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/cgn"/>', '<pos-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000B-9"/>'),
            ('<lemma-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/lemmas-nl"/>', '<lemma-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000E-3"/>'),
            ('<entity-annotation set="sonar-ner"/>', '<entity-annotation annotator="NERD" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000D-5"/>')
        ]
        try:
            f = codecs.open(filepath, 'r', 'utf-8')
            outputlines = []
            gapinsertpoint = 0   # line index just before </metadata>, where a gap declaration may be inserted
            hasgap = False       # does the document body contain a <gap> element?
            metadata = 0         # 0 = before metadata, 1 = inside metadata, 2 = past metadata
            for i, line in enumerate(f):
                if metadata == 0 and line.find('<?xml version') != -1:
                    # add the viewer stylesheet directly after the XML declaration
                    outputlines.append(line)
                    outputlines.append('<?xml-stylesheet type="text/xsl" href="sonar-foliaviewer.xsl"?>\n')
                    continue
                if metadata == 0 and line.find('<metadata src=') != -1:
                    metadata = 1
                if metadata == 1:
                    if line.find('</metadata>') != -1:
                        # remember where a <gap-annotation/> can still be declared
                        gapinsertpoint = i - 1
                        metadata = 2
                    else:
                        # rewrite old annotation declarations inside the metadata block
                        for source, target in replace:
                            line = line.replace(source, target)
                if metadata == 2:
                    if not hasgap and line.find('<gap ') != -1:
                        hasgap = True
                outputlines.append(line)
            f.close()
            # declare gap annotation only when the body actually uses gaps
            if hasgap and gapinsertpoint > 0:
                outputlines.insert(gapinsertpoint, '<gap-annotation />\n')
            # NOTE(review): 'dir' shadows the builtin; os.mkdir is not
            # recursive, so this assumes the parent of 'dir' already exists
            dir = os.path.dirname(outputfile)
            if not os.path.isdir(dir):
                os.mkdir(os.path.dirname(outputfile))
            # write to a .tmp file first, then rename: readers never see a
            # half-written output file
            tmpfile = filepath.replace(inputdir, outputdir) + '.tmp'
            f = codecs.open(tmpfile, 'w', 'utf-8')
            for line in outputlines:
                f.write(line)
            f.close()
            try:
                os.rename(tmpfile, outputfile)
            except:
                print >>sys.stderr, "Unable to write file " + outputfile
        except Exception as e:
            # best effort: report and fall through to the count step
            print >>sys.stderr, "ERROR: Got exception curating " + filepath + ": ", repr(e)
    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        # stream words with folia.Reader rather than loading the whole document
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >>sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                    # (lemma, pos) pairs are only counted when both are present
                    if word.pos():
                        freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >>sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        # best effort: report and return whatever was counted so far
        print >>sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)
    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos
if __name__ == '__main__': try: inputdir = sys.argv[1] outputdir = sys.argv[2] threads = int(sys.argv[3]) except: print >>sys.stderr,"Syntax: sonar_postproc.py inputdir outputdir threads" sys.exit(2) cat_freqlist_word = FrequencyList() cat_freqlist_lemma = FrequencyList() cat_freqlist_lemmapos = FrequencyList() maxtasksperchild = 10 preindex = True prevcategory = None print >>sys.stderr,"Initialising (indexing)..." processor = folia.CorpusProcessor(inputdir, process, threads, 'folia.xml',"",lambda x: True, maxtasksperchild,preindex) print >>sys.stderr,"Processing..." for i, data in enumerate(processor): filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos = data if filepath: category = None for e in filepath.split('/'): if e[-4:] != '.xml' and e[:3] == 'WR-' or e[:3] == 'WS-':
"""Build word / lemma / PoS frequency lists for a SONAR corpus directory."""
from __future__ import print_function, unicode_literals, division, absolute_import
import sys
import os

# Make the pynlpl package importable when this script runs straight from
# the source tree (two levels above the script's own directory)
if __name__ == "__main__":
    sys.path.append(sys.path[0] + '/../..')
    os.environ['PYTHONPATH'] = sys.path[0] + '/../..'

from pynlpl.formats.sonar import CorpusFiles, Corpus
from pynlpl.statistics import FrequencyList

# usage: <script> <sonar-corpus-directory>
sonardir = sys.argv[1]

freqlist = FrequencyList()           # word-form frequencies
lemmapos_freqlist = FrequencyList()  # "lemma.posheadtag" frequencies
poshead_freqlist = FrequencyList()   # PoS head-tag frequencies
pos_freqlist = FrequencyList()       # full PoS-tag frequencies

for i, doc in enumerate(Corpus(sonardir)):
    print("#" + str(i) + " Processing " + doc.filename, file=sys.stderr)
    for word, id, pos, lemma in doc:
        freqlist.count(word)
        # lemma/PoS counts only when both annotations are present
        if lemma and pos:
            # head tag is the part before the parenthesised features,
            # e.g. "N" out of "N(soort,ev)"
            poshead = pos.split('(')[0]
            lemmapos_freqlist.count(lemma + '.' + poshead)
            poshead_freqlist.count(poshead)
            pos_freqlist.count(pos)

freqlist.save('sonarfreqlist.txt')
# NOTE(review): the lemma/PoS frequency lists are built but not saved in the
# visible part of this script — presumably saved further down; confirm
settings.extension = a elif o == '-o': outputfile = a elif o == '-O': settings.autooutput = True elif o == '-s': settings.sentencemarkers = True elif o == '-r': settings.recurse = True else: raise Exception("No such option: " + o) if outputfile: outputfile = codecs.open(outputfile, 'w', settings.encoding) if len(sys.argv) >= 2: freqlist = FrequencyList() for x in sys.argv[1:]: if os.path.isdir(x): processdir(x, freqlist) elif os.path.isfile(x): freqlist += process(x) else: print >> sys.stderr, "ERROR: File or directory not found: " + x sys.exit(3) if outputfile: freqlist.save(outputfile, True) else: for line in freqlist.output("\t", True): print line else: print >> sys.stderr, "ERROR: No files specified"