def process(filepath):
    """Count word, lemma and (lemma, pos-tag) frequencies in one FoLiA document.

    Returns a tuple (filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos)
    so the caller can merge results per file.  Per-word annotation errors are
    reported on stderr and that word is skipped.

    NOTE(review): uses Python-2 print statements — this file presumably targets
    Python 2; confirm before porting.
    """
    # Log a timestamped progress line for this file
    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >> sys.stderr, s
    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        # Stream words with folia.Reader rather than loading the whole document
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >> sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                    # (lemma, pos) pairs are only counted when both are present
                    if word.pos():
                        freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >> sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        # Best effort: report the failure and return whatever was counted so far
        print >> sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)
    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos
def train(self, sourcefile, targetfile):
    """Train word co-occurrence tables from two line-aligned parallel corpora.

    :param sourcefile: path to the source-language text (one sentence per line)
    :param targetfile: path to the target-language text, line-aligned with it

    Builds self.sourcefreqlist / self.targetfreqlist and, for every token, a
    FrequencyList of the tokens it co-occurs with on the opposite side
    (self.source2target and self.target2source).
    """
    self.sourcefreqlist = FrequencyList(None, self.casesensitive)
    self.targetfreqlist = FrequencyList(None, self.casesensitive)  #frequency lists
    self.source2target = {}
    self.target2source = {}
    # 'with' guarantees both files are closed even if an error occurs mid-way
    # (the old code leaked the handles on any exception)
    with open(sourcefile) as fsource, open(targetfile) as ftarget:
        # NOTE: zip stops at the shorter file; assumes both are equally long
        for sourceline, targetline in zip(fsource, ftarget):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()
            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)
            for sourcetoken in sourcetokens:
                if sourcetoken not in self.source2target:
                    self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)
            for targettoken in targettokens:
                if targettoken not in self.target2source:
                    self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)
def load(self, filename):
    """Load a SimpleLanguageModel from *filename* (as produced by save()).

    Format: a '[simplelanguagemodel]' header, key=value settings, then
    '[freqlistN]' and '[freqlistNm1]' sections of tab-separated
    "tokens<TAB>count" lines.

    :raises Exception: if the file lacks the header or has a syntax error.
    Unparsable frequency-list lines are warned about and skipped.
    """
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    # 'with' ensures the handle is closed (the old code never closed it);
    # iterating the file directly avoids readlines() materializing everything
    with io.open(filename, 'r', encoding='utf-8') as f:
        # state machine: False = expect header, 1 = settings,
        # 2 = reading [freqlistN], 3 = reading [freqlistNm1]
        mode = False
        for line in f:
            line = line.strip()
            if not line:
                continue
            if not mode:
                if line != "[simplelanguagemodel]":
                    raise Exception("File is not a SimpleLanguageModel")
                else:
                    mode = 1
            elif mode == 1:
                if line[:2] == 'n=':
                    self.n = int(line[2:])
                elif line[:12] == 'beginmarker=':
                    self.beginmarker = line[12:]
                elif line[:10] == 'endmarker=':
                    self.endmarker = line[10:]
                elif line[:10] == 'sentences=':
                    self.sentences = int(line[10:])
                elif line[:14] == 'casesensitive=':
                    self.casesensitive = bool(int(line[14:]))
                    # case sensitivity affects how entries hash, so rebuild
                    self.freqlistN = FrequencyList(None, self.casesensitive)
                    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
                elif line == "[freqlistN]":
                    mode = 2
                else:
                    raise Exception("Syntax error in language model file: ", line)
            elif mode == 2:
                if line == "[freqlistNm1]":
                    mode = 3
                else:
                    try:
                        # ValueError covers both a bad split and a bad int()
                        # (the old bare except hid unrelated errors too)
                        ngram, count = line.split("\t")
                        self.freqlistN.count(ngram.split(' '), int(count))
                    except ValueError:
                        # NOTE(review): 'stderr' must be imported from sys at
                        # module level — confirm
                        print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
            elif mode == 3:
                try:
                    ngram, count = line.split("\t")
                    self.freqlistNm1.count(ngram.split(' '), int(count))
                except ValueError:
                    print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
    # Pre-built (n-1)-gram padding used at sentence boundaries
    if self.beginmarker:
        self._begingram = [self.beginmarker] * (self.n - 1)
    if self.endmarker:
        self._endgram = [self.endmarker] * (self.n - 1)
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for sent in sentences:
        freqlist.append(Windower(sent, 2))
    # the corpus is expected to contain "is a" twice and "this is" once
    has_is_a = freqlist[('is', 'a')] == 2
    has_this_is = freqlist[('this', 'is')] == 1
    self.assertTrue(has_is_a and has_this_is)
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    freqlist = FrequencyList()
    for sent in sentences:
        freqlist.append(sent)
    # len() of a FrequencyList is the number of distinct types
    self.assertEqual(len(freqlist), 9)
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    freqlist = FrequencyList()
    for sent in sentences:
        freqlist.append(sent)
    # .total is the number of tokens counted, duplicates included
    self.assertEqual(freqlist.total, 13)
def process(filename):
    """Build an n-gram frequency list for one FoLiA document.

    Honours settings.n, settings.casesensitive, settings.sentencemarkers,
    settings.autooutput and settings.extension.  Returns the FrequencyList;
    when autooutput is enabled it is also saved next to the input file with
    a '.freqlist' extension.
    """
    # was a Python-2 print statement, which misbehaves under Python 3
    print("Processing " + filename, file=sys.stderr)
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()
    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            # lowercase only when case does NOT matter (the old code had
            # this condition inverted)
            if not settings.casesensitive:
                text = text.lower()
            freqlist.count(text)
    else:
        # the old else-branch referenced undefined names ('sentence',
        # 'ngram'); both branches need the sentence loop
        for sentence in doc.sentences():
            if settings.sentencemarkers:
                window = Windower(sentence.words(), settings.n)
            else:
                # None, None suppresses the begin/end markers
                window = Windower(sentence.words(), settings.n, None, None)
            for ngram in window:
                # an ngram is a sequence of words; join their token texts
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            # the old code did 'outfilename +=' on an unbound name here
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)
    return freqlist
def test_freqlist_caseinsens(self):
    """Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for sent in sentences:
        freqlist.append(sent)
    # case-insensitive counting folds 'This'/'this' together but the
    # lookup key 'Test' is still matched case-insensitively
    expected = (freqlist['sentence'] == 2
                and freqlist['this'] == 2
                and freqlist['Test'] == 1)
    self.assertTrue(expected)
def processdir(d, freqlist=None):
    """Accumulate frequency counts from all matching files under directory *d*.

    :param d: directory to scan
    :param freqlist: FrequencyList to accumulate into (a new one is created
                     when omitted)
    :return: the accumulated FrequencyList

    Recurses into subdirectories when settings.recurse is set.
    """
    # Test identity against None: the old 'if not freqlist' also triggered on
    # an EMPTY FrequencyList passed by the caller, silently discarding the
    # caller's accumulator and losing all counts.
    if freqlist is None:
        freqlist = FrequencyList()
    print("Searching in " + d, file=sys.stderr)
    for f in glob.glob(os.path.join(d, '*')):
        if f[-len(settings.extension) - 1:] == '.' + settings.extension:
            freqlist += process(f)
        elif settings.recurse and os.path.isdir(f):
            processdir(f, freqlist)
    return freqlist
def buildfromfolia(self, files, encoding='utf-8'):
    """Build the model from one or more FoLiA documents.

    :param files: a filename or a list of filenames
    :param encoding: kept for interface compatibility (FoLiA handles its own)
    """
    freqlist = FrequencyList()
    # accept a single filename as well as a list
    if isinstance(files, str):
        files = [files]
    for filename in files:
        doc = folia.Document(file=filename)
        for sentence in doc.sentences():
            freqlist.append(sentence.toktext().split(' '))
    self.buildfromfreqlist(freqlist)
def buildfromtext(self, files, encoding='utf-8'):
    """Build the model from one or more plain-text files (one sentence per line).

    :param files: a filename or a list of filenames
    :param encoding: text encoding used to read the files
    """
    freqlist = FrequencyList()
    # accept a single filename as well as a list
    if isinstance(files, str):
        files = [files]
    for filename in files:
        with open(filename, 'r', encoding=encoding) as fh:
            for line in fh:
                freqlist.append(line.strip().split())
    self.buildfromfreqlist(freqlist)
def __init__(self, n=2, casesensitive=True, beginmarker="<begin>", endmarker="<end>"):
    """A simple n-gram model backed by two frequency lists (order n and n-1).

    :param n: n-gram order, must be an int >= 2
    :param casesensitive: whether counting distinguishes case
    :param beginmarker: token used to pad sentence starts ('' / None disables)
    :param endmarker: token used to pad sentence ends ('' / None disables)
    """
    self.casesensitive = casesensitive
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    assert isinstance(n, int) and n >= 2
    self.n = n
    self.beginmarker = beginmarker
    self.endmarker = endmarker
    self.sentences = 0
    # pre-build the (n-1)-gram of pure markers used to pad sentence edges
    if self.beginmarker:
        self._begingram = (self.beginmarker,) * (n - 1)
    if self.endmarker:
        self._endgram = (self.endmarker,) * (n - 1)
def main():
    """Command-line entry point: build frequency lists for files/directories.

    Options: -o FILE (output), -O (auto-output next to input), -e/-E
    (encoding/extension), -s (sentence markers), -r (recurse), -q (ignore
    errors), -h/--help.
    """
    try:
        # 'e:' was missing from the option string even though -e is
        # handled below, making that option impossible to pass
        opts, args = getopt.getopt(sys.argv[1:], "o:Oe:E:htspwrq", ["help"])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    outputfile = None
    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        elif o == '-e':
            settings.encoding = a
        elif o == '-E':
            settings.extension = a
        elif o == '-o':
            outputfile = a
        elif o == '-O':
            settings.autooutput = True
        elif o == '-s':
            settings.sentencemarkers = True
        elif o == '-r':
            settings.recurse = True
        elif o == '-q':
            settings.ignoreerrors = True
        else:
            raise Exception("No such option: " + o)
    if outputfile:
        outputfile = io.open(outputfile, 'w', encoding=settings.encoding)
    # Use the positional arguments left after option parsing; the old code
    # re-read sys.argv[1:], so option strings like '-o' were treated as
    # filenames and triggered a spurious "not found" exit.
    if args:
        freqlist = FrequencyList()
        for x in args:
            if os.path.isdir(x):
                processdir(x, freqlist)
            elif os.path.isfile(x):
                freqlist += process(x)
            else:
                print("ERROR: File or directory not found: " + x, file=sys.stderr)
                sys.exit(3)
        if outputfile:
            freqlist.save(outputfile, True)
            outputfile.close()  # the old code never closed the handle
        else:
            for line in freqlist.output("\t", True):
                print(line)
    else:
        print("ERROR: No files specified", file=sys.stderr)
        sys.exit(2)
def main():
    """Command-line entry point: print a per-type frequency/information table.

    Options: -n N (n-gram order, default 1), -i (case insensitive),
    -e ENC (file encoding).  Summary statistics go to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-h" or o == "--help":
            # -h/--help were in the option string but previously fell into
            # the "Unknown option" error branch
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)
    if not files:
        # was a Python-2 print statement, which raises TypeError on Python 3
        print("No files specified", file=sys.stderr)
        sys.exit(1)
    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # 'with' closes the file even if tokenisation raises
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))
    dist = Distribution(freqlist)
    # wordtype instead of 'type': don't shadow the builtin
    for wordtype, count in freqlist:
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        s = wordtype + "\t" + str(count) + "\t" + str(dist[wordtype]) + "\t" + str(
            dist.information(wordtype))
        print(s)
    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def process(filename):
    """Build an n-gram frequency list for one FoLiA document.

    Honours settings.n, settings.casesensitive, settings.sentencemarkers,
    settings.autooutput and settings.extension.  When settings.ignoreerrors
    is set, any error is reported and a (possibly empty) list is still
    returned; otherwise the exception propagates.
    """
    # Created before the try-block so the final return can never hit an
    # unbound name when an early error is ignored (old code could raise
    # NameError on 'return freqlist')
    freqlist = FrequencyList()
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        if settings.n == 1:
            for word in doc.words():
                text = word.toktext()
                # lowercase only when case does NOT matter (the old code had
                # this condition inverted)
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
        else:
            # the old else-branch referenced undefined names ('sentence',
            # 'ngram'); both branches need the sentence loop
            for sentence in doc.sentences():
                if settings.sentencemarkers:
                    window = Windower(sentence.words(), settings.n)
                else:
                    # None, None suppresses the begin/end markers
                    window = Windower(sentence.words(), settings.n, None, None)
                for ngram in window:
                    # an ngram is a sequence of words; join their token texts
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        if settings.autooutput:
            if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
                outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
            else:
                # the old code did 'outfilename +=' on an unbound name here
                outfilename = filename + '.freqlist'
            freqlist.save(outfilename, True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename, e, file=sys.stderr)
        else:
            raise
    return freqlist
for o, a in opts: if o == "-n": n = int(a) elif o == "-i": casesensitive = False elif o == "-e": encoding = a else: print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
def process(data):
    """Curate one SONAR FoLiA file and count word/lemma/pos frequencies in it.

    *data* is a (filepath, args, kwargs) tuple as dispatched by the corpus
    processor.  The curation step rewrites annotation declarations, inserts a
    stylesheet directive and (when needed) a gap-annotation declaration, then
    writes the result atomically to the mirrored path under outputdir.  The
    count step always runs, even when curation was skipped.

    Returns (filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos).

    NOTE(review): uses Python-2 print statements throughout — this script
    presumably targets Python 2; confirm before porting.
    """
    global foliadir, indexlength, inputdir, outputdir
    curate = True
    filepath, args, kwargs = data
    # mirror the input path into the output tree
    outputfile = filepath.replace(inputdir, outputdir)
    if os.path.exists(outputfile):
        # idempotence: never re-curate a file that was already produced
        print >>sys.stderr, "Skipping curation of " + filepath + " (output file already exists)"
        curate = False
    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >>sys.stderr, s
    if curate:
        #CURATION STEP
        # old (tadpole / sonar-ner) annotation declarations -> new (frog / NERD)
        # declarations with handle-based set identifiers
        replace = [
            ('<pos-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/cgn"/>', '<pos-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000B-9"/>'),
            ('<lemma-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/lemmas-nl"/>', '<lemma-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000E-3"/>'),
            ('<entity-annotation set="sonar-ner"/>', '<entity-annotation annotator="NERD" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000D-5"/>')
        ]
        try:
            f = codecs.open(filepath, 'r', 'utf-8')
            outputlines = []
            gapinsertpoint = 0   # line index just before </metadata>, where a gap declaration may be inserted
            hasgap = False       # does the document body contain a <gap> element?
            metadata = 0         # 0 = before metadata, 1 = inside metadata, 2 = past metadata
            for i, line in enumerate(f):
                if metadata == 0 and line.find('<?xml version') != -1:
                    # add the viewer stylesheet directly after the XML declaration
                    outputlines.append(line)
                    outputlines.append('<?xml-stylesheet type="text/xsl" href="sonar-foliaviewer.xsl"?>\n')
                    continue
                if metadata == 0 and line.find('<metadata src=') != -1:
                    metadata = 1
                if metadata == 1:
                    if line.find('</metadata>') != -1:
                        # remember where a <gap-annotation/> can still be declared
                        gapinsertpoint = i - 1
                        metadata = 2
                    else:
                        # rewrite old annotation declarations inside the metadata block
                        for source, target in replace:
                            line = line.replace(source, target)
                if metadata == 2:
                    if not hasgap and line.find('<gap ') != -1:
                        hasgap = True
                outputlines.append(line)
            f.close()
            # declare gap annotation only when the body actually uses gaps
            if hasgap and gapinsertpoint > 0:
                outputlines.insert(gapinsertpoint, '<gap-annotation />\n')
            # NOTE(review): 'dir' shadows the builtin; os.mkdir is not
            # recursive, so this assumes the parent of 'dir' already exists
            dir = os.path.dirname(outputfile)
            if not os.path.isdir(dir):
                os.mkdir(os.path.dirname(outputfile))
            # write to a .tmp file first, then rename: readers never see a
            # half-written output file
            tmpfile = filepath.replace(inputdir, outputdir) + '.tmp'
            f = codecs.open(tmpfile, 'w', 'utf-8')
            for line in outputlines:
                f.write(line)
            f.close()
            try:
                os.rename(tmpfile, outputfile)
            except:
                print >>sys.stderr, "Unable to write file " + outputfile
        except Exception as e:
            # best effort: report and fall through to the count step
            print >>sys.stderr, "ERROR: Got exception curating " + filepath + ": ", repr(e)
    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        # stream words with folia.Reader rather than loading the whole document
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >>sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                    # (lemma, pos) pairs are only counted when both are present
                    if word.pos():
                        freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >>sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        # best effort: report and return whatever was counted so far
        print >>sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)
    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos
if __name__ == '__main__': try: inputdir = sys.argv[1] outputdir = sys.argv[2] threads = int(sys.argv[3]) except: print >>sys.stderr,"Syntax: sonar_postproc.py inputdir outputdir threads" sys.exit(2) cat_freqlist_word = FrequencyList() cat_freqlist_lemma = FrequencyList() cat_freqlist_lemmapos = FrequencyList() maxtasksperchild = 10 preindex = True prevcategory = None print >>sys.stderr,"Initialising (indexing)..." processor = folia.CorpusProcessor(inputdir, process, threads, 'folia.xml',"",lambda x: True, maxtasksperchild,preindex) print >>sys.stderr,"Processing..." for i, data in enumerate(processor): filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos = data if filepath: category = None for e in filepath.split('/'): if e[-4:] != '.xml' and e[:3] == 'WR-' or e[:3] == 'WS-':
"""Build word / lemma / PoS frequency lists for a SONAR corpus directory."""
from __future__ import print_function, unicode_literals, division, absolute_import
import sys
import os

# Make the pynlpl package importable when this script runs straight from
# the source tree (two levels above the script's own directory)
if __name__ == "__main__":
    sys.path.append(sys.path[0] + '/../..')
    os.environ['PYTHONPATH'] = sys.path[0] + '/../..'

from pynlpl.formats.sonar import CorpusFiles, Corpus
from pynlpl.statistics import FrequencyList

# usage: <script> <sonar-corpus-directory>
sonardir = sys.argv[1]

freqlist = FrequencyList()           # word-form frequencies
lemmapos_freqlist = FrequencyList()  # "lemma.posheadtag" frequencies
poshead_freqlist = FrequencyList()   # PoS head-tag frequencies
pos_freqlist = FrequencyList()       # full PoS-tag frequencies

for i, doc in enumerate(Corpus(sonardir)):
    print("#" + str(i) + " Processing " + doc.filename, file=sys.stderr)
    for word, id, pos, lemma in doc:
        freqlist.count(word)
        # lemma/PoS counts only when both annotations are present
        if lemma and pos:
            # head tag is the part before the parenthesised features,
            # e.g. "N" out of "N(soort,ev)"
            poshead = pos.split('(')[0]
            lemmapos_freqlist.count(lemma + '.' + poshead)
            poshead_freqlist.count(poshead)
            pos_freqlist.count(pos)

freqlist.save('sonarfreqlist.txt')
# NOTE(review): the lemma/PoS frequency lists are built but not saved in the
# visible part of this script — presumably saved further down; confirm
settings.extension = a elif o == '-o': outputfile = a elif o == '-O': settings.autooutput = True elif o == '-s': settings.sentencemarkers = True elif o == '-r': settings.recurse = True else: raise Exception("No such option: " + o) if outputfile: outputfile = codecs.open(outputfile, 'w', settings.encoding) if len(sys.argv) >= 2: freqlist = FrequencyList() for x in sys.argv[1:]: if os.path.isdir(x): processdir(x, freqlist) elif os.path.isfile(x): freqlist += process(x) else: print >> sys.stderr, "ERROR: File or directory not found: " + x sys.exit(3) if outputfile: freqlist.save(outputfile, True) else: for line in freqlist.output("\t", True): print line else: print >> sys.stderr, "ERROR: No files specified"