Ejemplo n.º 1
0
    def create(
        xupath,
        file_paths,
        element_equality_fields,
        path_field_equality_components=None,
        path_field_equality_components_is_whitelist=None,
    ):
        """Build an XUGrep that has processed ``xupath`` against a file corpus.

        The XUPath text is scanned with the grammar production that
        GrammarLibrary returns for ``XUPathGrammar.XUPATH``; the ``path`` of
        the first match is what gets processed.

        :param xupath: the XUPath text to scan.
        :param file_paths: forwarded to ``Corpus.create_from_files``.
        :param element_equality_fields: forwarded to
            ``Corpus.create_from_files``.
        :param path_field_equality_components: forwarded to
            ``Corpus.create_from_files`` (defaults to None).
        :param path_field_equality_components_is_whitelist: forwarded to
            ``Corpus.create_from_files`` (defaults to None).
        :return: the XUGrep instance, after ``process_xupath`` has run.
        :raises IndexError: if scanning ``xupath`` yields no match.
        """
        # Scan the XUPath text and keep the parse result of every match.
        production = GrammarLibrary().get_grammar(XUPathGrammar.XUPATH)
        parse_trees = [
            tokens for tokens, _start, _end in production.scanString(xupath)
        ]
        # Only the first match is used from here on.
        xupath_pt_node = parse_trees[0].path

        # Build the corpus and run the XUPath over it.
        xugrep = XUGrep()
        xugrep.corpus = Corpus.create_from_files(
            file_paths,
            element_equality_fields,
            path_field_equality_components,
            path_field_equality_components_is_whitelist,
        )
        xugrep.process_xupath(xupath_pt_node)
        return xugrep
Ejemplo n.º 2
0
 def create(text, language_name):
     """Parse ``text`` and wrap the normalized result in a parse-tree object.

     ``text`` is parsed with the grammar that GrammarLibrary returns for
     ``language_name``; the result is passed through
     ``normalize_parse_tree`` and stored as the ``root`` of a new
     PythonDictionaryParseTree.

     :param text: the text handed to the grammar's ``parseString``.
     :param language_name: selects the grammar and the normalization.
     :return: the PythonDictionaryParseTree whose ``root`` is the normalized
         parse tree.
     """
     wrapper = PythonDictionaryParseTree()
     library = GrammarLibrary()
     raw_tree = library.get_grammar(language_name).parseString(text)
     wrapper.root = library.normalize_parse_tree(language_name, raw_tree)
     return wrapper
Ejemplo n.º 3
0
class CorpusElement():
    """A piece of text taken from a file, addressed by parallel paths.

    ``idx_path``, ``label_path`` and ``language_name_path`` each gain one
    component every time ``parse`` derives a child element, and
    ``text_ranges`` gains one ``[start, end]`` pair.

    Equality and hashing are configurable per instance:

    * ``element_equality_fields`` lists the field names (the string
      constants below) that ``__eq__`` and ``__hash__`` look at.
    * ``path_field_equality_components`` optionally maps a field name to a
      collection of component indices of that field's value.
    * ``path_field_equality_components_is_whitelist`` decides how those
      indices are used: truthy means only the listed components are
      compared, falsy means the listed components are ignored.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    idx_path = None
    label_path = None
    language_name_path = None
    file_path = None
    text = None          # Optional place to cache text
    text_ranges = None   # [start, end] pairs, one appended per parse() level
    grammar_library = None
    element_equality_fields = None
    path_field_equality_components = None
    path_field_equality_components_is_whitelist = None

    # Field names accepted by get_field / set_field.  Each constant is
    # spelled exactly like the attribute it names.
    IDX_PATH = "idx_path"
    LABEL_PATH = "label_path"
    LANGUAGE_NAME_PATH = "language_name_path"
    FILE_PATH = "file_path"
    TEXT = "text"
    TEXT_RANGES = "text_ranges"
    ELEMENT_EQUALITY_FIELDS = "element_equality_fields"
    PATH_FIELD_EQUALITY_COMPONENTS = "path_field_equality_components"
    PATH_FIELD_EQUALITY_COMPONENTS_IS_WHITELIST = "path_field_equality_components_is_whitelist"

    # Every name that get_field / set_field recognise.
    _FIELD_NAMES = (
        IDX_PATH,
        LABEL_PATH,
        LANGUAGE_NAME_PATH,
        FILE_PATH,
        TEXT,
        TEXT_RANGES,
        ELEMENT_EQUALITY_FIELDS,
        PATH_FIELD_EQUALITY_COMPONENTS,
        PATH_FIELD_EQUALITY_COMPONENTS_IS_WHITELIST,
    )

    def __init__(self):
        """Create an empty element with its own GrammarLibrary."""
        self.idx_path = []
        self.label_path = []
        self.language_name_path = []
        self.file_path = []
        self.text = None
        self.text_ranges = []
        self.grammar_library = GrammarLibrary()
        self.element_equality_fields = None
        self.path_field_equality_components = None
        self.path_field_equality_components_is_whitelist = None

    @staticmethod
    def create(idx_path, label_path, language_name_path, file_path, element_equality_fields):
        """Factory: build a CorpusElement that covers a whole file.

        The file is read completely; its content is cached in ``text`` and
        ``text_ranges`` is set to the single range ``[0, len(text)]``.

        :param idx_path: stored as the element's ``idx_path``.
        :param label_path: stored as the element's ``label_path``.
        :param language_name_path: stored as the element's
            ``language_name_path``.
        :param file_path: path of the file to read; also stored on the
            element.
        :param element_equality_fields: field names used by ``__eq__`` and
            ``__hash__``.
        :return: the populated CorpusElement.
        """
        corpus_element = CorpusElement()

        corpus_element.idx_path = idx_path
        corpus_element.label_path = label_path
        corpus_element.language_name_path = language_name_path
        corpus_element.file_path = file_path
        corpus_element.element_equality_fields = element_equality_fields

        # The context manager closes the file even when read() raises.
        with open(file_path) as fp:
            text = fp.read()

        corpus_element.text = text
        corpus_element.text_ranges = [[0, len(text)]]

        return corpus_element

    # Getters and setters
    def get_field(self, field_name):
        """Return the value of the field called ``field_name``.

        :param field_name: one of the field-name constants of this class.
        :return: the current value of that field.
        :raises ValueError: if ``field_name`` is not a known field name.
        """
        if field_name not in self._FIELD_NAMES:
            raise ValueError("Unrecognized field name: " + repr(field_name))
        # Field names double as attribute names, so no per-field branch is
        # needed and no field can be forgotten.
        return getattr(self, field_name)

    def set_field(self, field_name, field_value):
        """Assign ``field_value`` to the field called ``field_name``.

        :param field_name: one of the field-name constants of this class.
        :param field_value: the new value.
        :raises ValueError: if ``field_name`` is not a known field name.
        """
        if field_name not in self._FIELD_NAMES:
            raise ValueError("Unrecognized field name: " + repr(field_name))
        setattr(self, field_name, field_value)

    def get_idx_path(self):
        """Return ``idx_path``."""
        return self.idx_path

    def get_label_path(self):
        """Return ``label_path``."""
        return self.label_path

    def get_language_name_path(self):
        """Return ``language_name_path``."""
        return self.language_name_path

    def get_file_path(self):
        """Return ``file_path``."""
        return self.file_path

    def get_text_ranges(self):
        """Return ``text_ranges``."""
        return self.text_ranges

    def get_text(self):
        """Return the cached ``text``."""
        return self.text

    def get_element_equality_fields(self):
        """Return ``element_equality_fields``."""
        return self.element_equality_fields

    def get_path_field_equality_components(self):
        """Return ``path_field_equality_components``."""
        return self.path_field_equality_components

    def get_path_field_equality_components_is_whitelist(self):
        """Return ``path_field_equality_components_is_whitelist``."""
        return self.path_field_equality_components_is_whitelist

    def parse(self, language_name):
        """Derive one child element per match of a grammar in this text.

        The grammar production for ``language_name`` is scanned over
        ``self.text``.  Each match becomes a new CorpusElement whose paths
        extend this element's paths by one component and whose text is the
        matched slice with surrounding whitespace stripped.  The equality
        configuration is passed on unchanged.

        :param language_name: the language whose strings are extracted.
        :return: a set of the new CorpusElements.  Because it is a set,
            children that compare equal under the equality configuration
            collapse into one, and ``__hash__`` raises if
            ``element_equality_fields`` is None.
        """
        new_corpus_elements = set()

        grammar_production = self.grammar_library.get_grammar(language_name)
        matches = grammar_production.scanString(self.text)

        for match_idx, (match, start, end) in enumerate(matches):
            grammar_instance = self.grammar_library.get_grammar_instance(language_name)
            new_label = grammar_instance.get_label_for_match(language_name, match, match_idx)

            child = CorpusElement()
            # The new index component is left as None here.
            child.idx_path = list(self.idx_path) + [None]
            child.label_path = list(self.label_path) + [new_label]
            child.language_name_path = list(self.language_name_path) + [language_name]
            child.file_path = self.file_path
            # start/end are offsets into this element's text.
            child.text_ranges = list(self.text_ranges) + [[start, end]]
            child.text = self.text[start:end].strip()
            child.element_equality_fields = self.element_equality_fields
            child.path_field_equality_components = self.path_field_equality_components
            child.path_field_equality_components_is_whitelist = (
                self.path_field_equality_components_is_whitelist
            )

            new_corpus_elements.add(child)

        return new_corpus_elements

    def _equality_value(self, field_name, value):
        """Return the part of ``value`` that takes part in equality.

        Shared by ``__hash__`` and ``__eq__`` so both apply exactly the same
        whitelist / blacklist projection.
        """
        components = self.path_field_equality_components
        if components is None or field_name not in components:
            return value

        idxs = components[field_name]
        if self.path_field_equality_components_is_whitelist:
            # Whitelist: keep only the listed components, in listed order.
            return [value[idx] for idx in idxs]

        # Blacklist: drop the listed components.
        assert isinstance(value, list)
        return [
            component
            for component_idx, component in enumerate(value)
            if component_idx not in idxs
        ]

    # In Python set elements must be hashable (implement __hash__())
    def __hash__(self):
        """Hash the string forms of the configured equality values.

        :raises Exception: if ``element_equality_fields`` is None.
        """
        if self.element_equality_fields is None:
            raise Exception("Must invoke set_field on element_equality_fields to evaluate equality!")
        values = tuple(
            str(self._equality_value(field_name, self.get_field(field_name)))
            for field_name in self.element_equality_fields
        )
        return hash(values)

    # We want to define some basic notions of equality so as to compare
    #  elements in sets
    def __eq__(self, other):
        """Compare two elements on the configured equality fields.

        :param other: the object to compare with.
        :return: True or False for another CorpusElement; NotImplemented for
            any other type, so Python falls back to its default comparison.
        :raises RuntimeWarning: if the two elements do not share the same
            equality configuration.
        """
        if not isinstance(other, CorpusElement):
            return NotImplemented

        if self.path_field_equality_components != other.path_field_equality_components:
            raise RuntimeWarning("Path field equality components are not the same in elements!")
        if self.path_field_equality_components_is_whitelist != other.path_field_equality_components_is_whitelist:
            raise RuntimeWarning("Path field equality components are not both black|white lists!")
        if self.element_equality_fields != other.element_equality_fields:
            raise RuntimeWarning("Element equality fields are not equal in elements being compared!")

        blacklisting = (
            self.path_field_equality_components is not None
            and not self.path_field_equality_components_is_whitelist
        )

        for field_name in self.element_equality_fields:
            self_value = self.get_field(field_name)
            other_value = other.get_field(field_name)

            if blacklisting and field_name in self.path_field_equality_components:
                # Blacklisted positions only line up when both values are
                # lists of the same length.
                assert isinstance(self_value, list)
                assert isinstance(other_value, list)
                assert len(self_value) == len(other_value)

            if self._equality_value(field_name, self_value) != \
                    self._equality_value(field_name, other_value):
                return False
        return True

    # We want to print out the corpus element
    def __str__(self):
        """Return the label path and language name path, one per line."""
        result = [
            "label path: " + ":".join(self.label_path),
            "language name path " + ":".join(self.language_name_path),
        ]
        return "\n".join(result)