def raw(files = items):
    """
    Read the entries of one or more files in the "words" corpus directory,
    one entry per line.

    @param files: One or more file names inside the "words" directory
        (defaults to the module-level C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    @return: each line of each file with surrounding whitespace stripped
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "words", name)
        # Read everything and close the handle before the first yield, so an
        # abandoned generator never keeps the file open.
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        for word in lines:
            yield word.strip()
def chunked(files = items, chunk_types=('NP',)):
    """
    Read chunked sentences from one or more CoNLL-2000 files.

    @param files: One or more file names (without the ".txt" extension)
        inside the "conll2000" directory (defaults to the module-level
        C{items})
    @type files: L{string} or L{tuple(string)}
    @param chunk_types: chunk types passed through to C{tree.conll_chunk}
    @type chunk_types: L{tuple(string)}
    @rtype: iterator over the values returned by C{tree.conll_chunk}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "conll2000", name + ".txt")
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        # One item per block produced by tokenize.blankline
        for sent in tokenize.blankline(text):
            yield tree.conll_chunk(sent, chunk_types)
def tagged(files = items):
    """
    Read tagged sentences from one or more CoNLL-2000 files, dropping the
    chunk column.

    @param files: One or more file names (without the ".txt" extension)
        inside the "conll2000" directory (defaults to the module-level
        C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list} of (word, tag) pairs
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "conll2000", name + ".txt")
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(text):
            # _list_sent produces (word, tag, chunk) triples; keep the
            # first two fields only
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
def raw(files = items):
    """
    Read the records of one or more files in the "ppattach" directory, one
    record per line.

    @param files: One or more file names inside the "ppattach" directory
        (defaults to the module-level C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple(string)}
    @return: the whitespace-separated fields of each line
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "ppattach", name)
        # Read everything and close the handle before the first yield
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        for line in lines:
            yield tuple(line.split())
def _read(files, conversion_function):
    """
    Read one or more files from the "brown" directory and convert each
    sentence block with the supplied function.

    @param files: One or more file names inside the "brown" directory
    @type files: L{string} or L{tuple(string)}
    @param conversion_function: callable applied to each block produced by
        C{tokenize.blankline}; its result is yielded unchanged
    @rtype: iterator over the values returned by C{conversion_function}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "brown", name)
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(text):
            yield conversion_function(sent)
def raw(files = items):
    """
    Read the documents of one or more files in the "ieer" directory.

    Each file is split on the closing C{</DOC>} tag; every piece that
    contains exactly one opening C{<DOC>} tag is yielded with both tags
    restored and a trailing newline.  Text before the opening tag is
    discarded.

    @param files: One or more file names inside the "ieer" directory
        (defaults to the module-level C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "ieer", name)
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for piece in text.split('</DOC>'):
            parts = piece.split('<DOC>')
            # Anything other than [prefix, body] is not a well-formed
            # document (e.g. the text after the last closing tag): skip it
            if len(parts) == 2:
                yield "<DOC>" + parts[1] + "</DOC>\n"
def bracket_parse(files=items):
    """
    Read bracketed parses from one or more YCOE ".psd" files.

    @param files: One or more file names (without the ".psd" extension)
        inside the "ycoe/psd" directory (defaults to the module-level
        C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the values returned by C{tree.bracket_parse}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "ycoe/psd", name + ".psd")
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        # _parse yields the individual sentences of the file
        for sent in _parse(text):
            yield tree.bracket_parse(sent)
def raw(files = items):
    """
    Read the tokens of one or more files in the "state_union" directory.

    @param files: One or more file names (without the ".txt" extension)
        inside the "state_union" directory (defaults to the module-level
        C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by C{tokenize.wordpunct}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "state_union", name + ".txt")
        # Close the handle before yielding anything; the original kept it
        # open for the whole life of the generator
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for token in tokenize.wordpunct(text):
            yield token
def raw(files = items):
    """
    Read the tokens of one or more files in the "gutenberg" directory,
    skipping the preamble.

    Every line up to and including the first line that starts with
    C{*END*} is treated as preamble and produces no tokens.

    @param files: One or more file names (without the ".txt" extension)
        inside the "gutenberg" directory (defaults to the module-level
        C{items})
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by C{tokenize.wordpunct}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "gutenberg", name + ".txt")
        # Read everything and close the handle before the first yield; the
        # original kept it open for the whole life of the generator
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        preamble = True
        for line in lines:
            if not preamble:
                for token in tokenize.wordpunct(line):
                    yield token
            # Checked after tokenizing so the marker line itself is skipped
            if line[:5] == '*END*':
                preamble = False
def raw(files = 'cmudict'):
    """
    Read the entries of one or more cmudict files, one entry per line.

    Each line is stripped and split on single spaces; the first field is
    kept as is, the second is converted to an integer and the remaining
    fields are collected in a tuple.  (Presumably these are the word, the
    pronunciation number and the phonemes -- confirm against the data.)

    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple} C{(string, int, tuple(string))}
    @raise ValueError: if the second field of a line is not an integer
    @raise IndexError: if a line has fewer than two fields
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "cmudict", name)
        # Read everything and close the handle before the first yield
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        for line in lines:
            fields = line.strip().split(' ')
            yield (fields[0], int(fields[1]), tuple(fields[2:]))
def raw(files = 'rotokas'):
    """
    Read the entries of one or more ".dic" files in the "shoebox"
    directory.

    @param files: One or more dictionary files to be processed, named
        without the ".dic" extension
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list}
    @return: for each block produced by C{tokenize.blankline}, the items
        produced by C{_parse_entry} collected in a list
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "shoebox", name + ".dic")
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for entry in tokenize.blankline(text):
            yield list(_parse_entry(entry))
def raw(files="english-kjv"):
    """
    Read the tokens of one or more files in the "genesis" directory.

    @param files: One or more files to be processed, named without the
        ".txt" extension
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by C{tokenize.whitespace}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "genesis", name + ".txt")
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for token in tokenize.whitespace(text):
            yield token
def _read(files, conversion_function):
    """
    Read one or more files from the "ycoe/pos" directory and convert each
    non-empty sentence block with the supplied function.

    Before conversion, every match of C{<.*>_CODE} or C{\\s.*_ID} is
    removed from the block; blocks that become empty are skipped.

    @param files: One or more file names inside the "ycoe/pos" directory
    @type files: L{string} or L{tuple(string)}
    @param conversion_function: callable invoked as
        C{conversion_function(sent, sep="_")}; its result is yielded
        unchanged
    @rtype: iterator over the values returned by C{conversion_function}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    # The pattern does not depend on the file, so compile it once.
    # re.VERBOSE ignores the whitespace and newlines inside the pattern.
    rx_pattern = re.compile(
        r"""
        <.*>_CODE
        |\s.*_ID
        """,
        re.VERBOSE | re.UNICODE,
    )

    for name in files:
        path = os.path.join(get_basedir(), "ycoe/pos", name)
        # Close the handle before yielding anything
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(text):
            sent = rx_pattern.sub("", sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
def loadParadigm(self, p_filename ): """ Load the given paradigm (XML file) Attributes are stored in self.attributes Data are stored in self.data They can be accessed as follows: self.attributes['gender'] # list of genders self.data[6]['gender'] # gender for the sixth data object self.data[6]['content'] # content for the sixth data object """ from nltk_lite.corpora import get_basedir basedir = get_basedir() # Look for the file try_filename = os.path.join(get_basedir(), "paradigms", p_filename) try: f = open(try_filename) p_filename = try_filename except IOError: print "Cannot find file" return None f.close() # These variables will be set by this method self.attributes = {} # A new dictionary self.data = [] # A new list # XML admin: create Reader object, parse document reader = Sax2.Reader() doc = reader.fromStream(p_filename) # Cycle through the given attributes and add them to self.attributes # for <name> in <attributes> attributes = doc.getElementsByTagName('attributes')[0] for name in attributes.getElementsByTagName('name'): # Setup a list of attribute values tmp_list = [] # for each value under name, store in list for value in name.getElementsByTagName('value'): tmp_list.append(value.getAttribute('value')) # Store list of values in dictionary self.attributes[name.getAttribute('name')] = tmp_list # Cycle through data objects and add them to self.data # for <form> in <paradigm> forms = doc.getElementsByTagName('paradigm')[0] for form in forms.getElementsByTagName('form'): # Initialise a temporary dictionary tmp_dict = {} for value in form.getElementsByTagName('attribute'): tmp_dict[value.getAttribute('name')] = value.getAttribute('value') # Add the new dictionary to the data list self.data.append(tmp_dict) # Talk to the user print "Paradigm information successfully loaded from file:", p_filename # State the number and print out a list of attributes print " "*4 + str(len(self.attributes)) + " attributes imported:", for att in self.attributes: print att, print # 
State the number of paradigm objects imported print " "*4 + str(len(self.data)) + " paradigm objects imported." return
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    """
    Read one or more YCOE ".psd" files and build, for every sentence, a
    stack of trees containing the requested chunk types.

    Each file is read from the "ycoe/psd" directory, split into sentences
    by C{_parse}, and each sentence is scanned token by token (as split by
    C{tokenize.whitespace}).  Opening brackets increase a nesting counter
    and may open a new chunk; closing brackets attach the token to the
    current tree and close chunks whose nesting level has been left.

    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks and indentation were lost -- confirm it against the
    original file before relying on it.

    @param files: One or more file names (without the ".psd" extension)
    @type files: L{string} or L{tuple(string)}
    @param chunk_types: labels that start a chunk; may be C{None} when
        C{partial_match} is false (it is iterated when C{partial_match}
        is true)
    @param top_node: label of the root tree created for every sentence
    @param partial_match: if true, a label matches a chunk type when it
        starts with that chunk type; otherwise it must be a member of
        C{chunk_types}
    @param collapse_partials: if true (and C{partial_match} is true), a
        partially matched label is replaced by the chunk type it matched
    @param cascade: if true, matched chunks are pushed on the stack and so
        may nest; otherwise a chunk is attached to the top tree only when
        no other chunk is open
    @rtype: iterator over L{list}
    @return: for each sentence, the stack list, whose first element is the
        tree created with C{top_node}
    """
    # allow any kind of bracketing for flexibility
    L_BRACKET = re.compile(r"[\(\[\{<]")
    R_BRACKET = re.compile(r"[\)\]\}>]")

    # A single file name is wrapped in a tuple so the loop handles both cases
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        # NOTE: the loop variable reuses the name s, replacing the file text
        for s in data:
            bracket = 0      # current bracket nesting depth
            itmType = None   # label most recently seen after an opening bracket
            stack = [tree.Tree(top_node, [])]
            inTag = []       # nesting depths at which chunks were opened
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    # drop the bracket, keeping the label that follows it
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            # prefix test: the label starts with the chunk type
                            if len(eachItm) <= len(itm) and eachItm == itm[: len(eachItm)]:
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if chunk_types is not None and itm in chunk_types:
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            # without cascading, chunks never nest
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    # NOTE(review): split is not defined in this block;
                    # presumably string.split imported at module level --
                    # TODO confirm
                    tmpItm = split(itm, itm[-1])
                    # NOTE(review): if split() returns a list, this
                    # comparison with "" is always true -- confirm intent
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            # still inside the innermost open chunk
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                # fold the finished chunk into its parent
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            # NOTE(review): this drops the last two entries
                            # of inTag, not one -- confirm intent
                            inTag = [] + inTag[:-2]
                    # one nesting level is left per closing bracket on the token
                    bracket -= len(tmpItm) - 1
                    # close every chunk opened deeper than the current level
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack