Esempio n. 1
0
    def segment_unit(self, unit, args):
        patterns = self.get_patterns()
        unit.patterns = []
        found_groups = {}
        
        rgx_matched = re.compile(ur'[_<>]')

        unit.match_conditions = True

        self.get_plain_content_from_unit(unit)
        
        for pattern in patterns:
            patternid = pattern['id']
            if not patternid: continue
            condition = self.get_condition(pattern)
            if condition == 'ignore': continue

            # get regex from pattern
            rgx = self.get_regex_from_pattern(patterns, patternid)
            hilited = patternid in self.options['hilite']
            
            def markup_segment(match):
                segment = match.group(0)
                if rgx_matched.search(segment, 1): return segment
                
                # mark it up
                span = '<span class="m ms">' if hilited else '<span class="m">'
                unit.patterns.append([patternid, segment])
                rep = ur'%s%s</span>' % (span, segment.replace(' ', '_'))

                # add variant
                if hilited and ('variants' in self.toreturn):
                    variant = self.get_variant_from_segment(segment)
                    self.variants[variant] = self.variants.get(variant, 0) + 1

                return rep

            # apply regex to unit
            if rgx:
                len_before = len(unit.plain_content)
                unit.plain_content, found = rgx.subn(markup_segment, unit.plain_content, 1)
                found = len(unit.plain_content) != len_before

                if (condition == 'include' and not found) or (condition == 'exclude' and found):
                    unit.match_conditions = False
                if found:
                    found_groups[re.sub(ur'-\d+$', '', pattern['key'])] = 1
                    dputils.inc_counter(self.stats['patterns'], pattern['id'], 1)
                else:
                    unit.patterns.append([patternid, ''])
        
        for group in found_groups:
            self.stats['groups'][group] += 1
            
        if found_groups:
            unit.plain_content = unit.plain_content.replace('_', ' ')
Esempio n. 2
0
def get_elementid_from_xml_element(element, idcount, as_string=False):
    ''' returns the elementid as a list
        e.g. [(u'', u'clause'), (u'type', u'disposition')]
        or None if the element is not in the white list below or has no
        usable data-dpt attribute.

        element: an xml element (etree)
        idcount: a dictionary, new for each enclosing text unit. Used to know
            which occurrence of an element we are seeing and generate a
            unique id. E.g. two same titles ('sheriff') marked up in the same
            way within the same entry => we need a count to differentiate
            them. We add [@o, 2] to second occurrence, etc.
        as_string: not used by this function.
    '''
    from django.utils.text import slugify

    element_text = utils.get_xml_element_text(element)

    # eg. parts: [(u'', u'clause'), (u'type', u'disposition')]
    parts = [
        (unicode(re.sub('data-dpt-?', '', k)), unicode(v))
        for k, v in element.attrib.iteritems()
        if k.startswith('data-dpt') and k != 'data-dpt-cat'
    ]

    # white list to filter the elements.
    # An element without any data-dpt attribute yields no parts at all;
    # treat it like any other element outside the white list instead of
    # failing on parts[0].
    if not parts or parts[0][1] not in ('clause', 'location', 'person'):
        return None

    # short texts become part of the id
    element_text = slugify(u'%s' % element_text.lower())
    if 0 < len(element_text) < 20:
        parts.append(['@text', element_text])

    order = dputils.inc_counter(idcount, repr(parts))
    if order > 1:
        # add (u'@o', u'2') if it is the 2nd occurrence of this elementid
        parts.append((u'@o', u'%s' % order))

    return parts
Esempio n. 3
0
def get_elementid_from_xml_element(element, idcount, as_string=False):
    ''' returns the elementid as a list
        e.g. [(u'', u'clause'), (u'type', u'disposition')]
        or None if the element is not in the white list below or has no
        usable data-dpt attribute.

        element: an xml element (etree)
        idcount: a dictionary, new for each enclosing text unit. Used to know
            which occurrence of an element we are seeing and generate a
            unique id. E.g. two same titles ('sheriff') marked up in the same
            way within the same entry => we need a count to differentiate
            them. We add [@o, 2] to second occurrence, etc.
        as_string: not used by this function.
    '''
    from django.utils.text import slugify

    element_text = utils.get_xml_element_text(element)

    # eg. parts: [(u'', u'clause'), (u'type', u'disposition')]
    parts = [
        (unicode(re.sub('data-dpt-?', '', k)), unicode(v))
        for k, v in element.attrib.iteritems()
        if k.startswith('data-dpt') and k != 'data-dpt-cat'
    ]

    # white list to filter the elements.
    # An element without any data-dpt attribute yields no parts at all;
    # treat it like any other element outside the white list instead of
    # failing on parts[0].
    if not parts or parts[0][1] not in ('clause', 'location', 'person'):
        return None

    # short texts become part of the id
    element_text = slugify(u'%s' % element_text.lower())
    if 0 < len(element_text) < 20:
        parts.append(['@text', element_text])

    order = dputils.inc_counter(idcount, repr(parts))
    if order > 1:
        # add (u'@o', u'2') if it is the 2nd occurrence of this elementid
        parts.append((u'@o', u'%s' % order))

    return parts
Esempio n. 4
0
    def draw_internal(self):
        ''' Builds self.drawing from self.points and the current queries.

            Effects visible in this method:
            * resets self.context['canvas'] and self.drawing
            * widens self.mins[0] / self.maxs[0] to cover the x ranges of
              the points (bounds found in MAX_DATE_RANGE are ignored)
            * counts in self.histogram, for each x value, the points per
              combination of layers
            * stacks each point (self.stack_point) and appends it to
              self.drawing['points']
            * updates self.cat_hits[v] = [number of points, number of
              points with at least one truthy layer] for each y value v
        '''
        self.context['canvas'] = {'width': 500, 'height': 500}

        self.drawing = {
            'points': [],
            'x': [],
            'y': [],
            'bar_height': self.bar_height,
            'font_size': self.font_size,
            'label_margin': self.margin,
        }

        points = self.drawing['points']

        self.drawing['colors'] = [
            query.get_color() for query in self.queries.get_queries()
        ]
        self.drawing['summaries'] = [
            query.get_summary() for query in self.queries.get_queries()
        ]

        self.init_bands()

        # process all the points, by ascending start of their x range
        points_order = sorted(self.points.keys(),
                              key=lambda cid: self.points[cid][0][0])
        for cid in points_order:
            # point: [x range, y value(s), layers]
            point = self.points[cid]
            found = any(point[2])
            x = point[0]
            ys = point[1]

            # update the min / max x
            if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                    self.mins[0] is None or self.mins[0] > x[0]):
                self.mins[0] = x[0]
            if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                    self.maxs[0] is None or self.maxs[0] < x[1]):
                self.maxs[0] = x[1]

            # update histogram: for each x value covered by the point,
            # one counter per combination of layers
            # NOTE(review): x[0] / x[1] are tested against None above but
            # not here -- confirm they can never be None at this stage
            layers_key = ','.join(['%s' % li for li in sorted(point[2])])
            for xi in range(x[0], x[1] + 1):
                hist = self.histogram.setdefault(xi, {})
                inc_counter(hist, layers_key)

            # convert y to numerical value
            if not isinstance(ys, list):
                ys = [ys]
            # NOTE(review): when ys holds several values the same `point`
            # object is updated and appended once per value, so all its
            # entries in `points` share the last y -- confirm this is
            # intended
            for v in ys:
                y = self.bands.get(v, 0)

                # add the points to the stack
                point[0] = x
                point[1] = y
                # convert layers from set to list to allow json serialisation
                point[2] = list(point[2])

                self.stack_point(point)
                points.append(point)

                # increment hits per category
                hits = self.cat_hits.setdefault(v, [0, 0])
                hits[0] += 1
                if found:
                    hits[1] += 1
Esempio n. 5
0
    def segment_unit(self, unit, args):
        patterns = self.get_patterns()
        unit.patterns = []
        found_groups = {}

        rgx_matched = re.compile(ur'[_<>]')

        unit.match_conditions = True

        self.get_plain_content_from_unit(unit)

        for pattern in patterns:
            patternid = pattern['id']
            if not patternid:
                continue
            condition = self.get_condition(pattern)
            if condition == 'ignore':
                continue

            # get regex from pattern
            rgx = self.get_regex_from_pattern(patterns, patternid)
            hilited = patternid in self.options['hilite']

            def markup_segment(match):
                segment = match.group(0)
                if rgx_matched.search(segment, 1):
                    return segment

                # mark it up
                span = '<span class="m ms">' if hilited else '<span class="m">'
                unit.patterns.append([patternid, segment])
                rep = ur'%s%s</span>' % (span, segment.replace(' ', '_'))

                # add variant
                if hilited and ('variants' in self.toreturn):
                    variant = self.get_variant_from_segment(segment)
                    self.variants[variant] = self.variants.get(variant, 0) + 1

                return rep

            # apply regex to unit
            if rgx:
                len_before = len(unit.plain_content)
                unit.plain_content, found = rgx.subn(markup_segment,
                                                     unit.plain_content, 1)
                found = len(unit.plain_content) != len_before

                if (condition == 'include'
                        and not found) or (condition == 'exclude' and found):
                    unit.match_conditions = False
                if found:
                    found_groups[re.sub(ur'-\d+$', '', pattern['key'])] = 1
                    dputils.inc_counter(self.stats['patterns'], pattern['id'],
                                        1)
                else:
                    unit.patterns.append([patternid, ''])

        for group in found_groups:
            self.stats['groups'][group] += 1

        if found_groups:
            unit.plain_content = unit.plain_content.replace('_', ' ')
Esempio n. 6
0
    def draw_internal(self):
        ''' Builds self.drawing from self.points and the current queries.

            Effects visible in this method:
            * resets self.context['canvas'] and self.drawing
            * converts the x value of each point into a numerical range
              [start, end]
            * widens self.mins[0] / self.maxs[0] to cover those ranges
              (bounds found in MAX_DATE_RANGE are ignored)
            * counts in self.histogram, for each x value, the points per
              layer and keeps the highest count in self.histogram_height
            * stacks each point (self.stack_point) and appends it to
              self.drawing['points']
            * updates self.cat_hits[v] = [number of points, number of
              points with at least one truthy layer] for each y value v
        '''
        self.context['canvas'] = {'width': 500, 'height': 500}

        self.drawing = {
            'points': [],
            'x': [],
            'y': [],
            'bar_height': self.bar_height,
            'font_size': self.font_size,
            'label_margin': self.margin,
        }

        points = self.drawing['points']

        self.drawing['colors'] = [
            query.get_color() for query in self.queries.get_queries()
        ]

        self.init_bands()

        from digipal.utils import get_range_from_date, MAX_DATE_RANGE

        # process all the points
        for point in self.points.values():
            # point: [x value, y value(s), layers]
            found = any(point[2])
            x = point[0]
            ys = point[1]

            # convert x to numerical value
            if self.fields[0]['type'] == 'date':
                x = get_range_from_date(x)
            elif self.fields[0]['key'] == 'locus':
                # 12v => 25
                n = int(x[0:-1]) * 2
                if x[-1] == 'v':
                    n += 1
                x = [n, n]
            else:
                # TODO: other type than date for x
                x = 0

            # turn all x into range
            if not isinstance(x, list):
                x = [x, x]

            # update the min / max x
            if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                    self.mins[0] is None or self.mins[0] > x[0]):
                self.mins[0] = x[0]
            if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                    self.maxs[0] is None or self.maxs[0] < x[1]):
                self.maxs[0] = x[1]

            # update histogram: for each x value covered by the point,
            # one counter per layer
            # NOTE(review): x[0] / x[1] are tested against None above but
            # not here -- confirm they can never be None at this stage
            for xi in range(x[0], x[1] + 1):
                hist = self.histogram.setdefault(xi, {})
                for layer in point[2]:
                    self.histogram_height = max(inc_counter(hist, layer),
                                                self.histogram_height)

            # convert y to numerical value
            if not isinstance(ys, list):
                ys = [ys]
            # NOTE(review): when ys holds several values the same `point`
            # object is updated and appended once per value, so all its
            # entries in `points` share the last y -- confirm this is
            # intended
            for v in ys:
                y = self.bands.get(v, 0)

                # add the points to the stack
                point[0] = x
                point[1] = y
                # convert layers from set to list to allow json serialisation
                point[2] = list(point[2])

                self.stack_point(point)
                points.append(point)

                # increment hits per category
                hits = self.cat_hits.setdefault(v, [0, 0])
                hits[0] += 1
                if found:
                    hits[1] += 1
Esempio n. 7
0
    def draw_internal(self):
        ''' Builds self.drawing from self.points and the current queries.

            Effects visible in this method:
            * resets self.context['canvas'] and self.drawing
            * widens self.mins[0] / self.maxs[0] to cover the x ranges of
              the points (bounds found in MAX_DATE_RANGE are ignored)
            * counts in self.histogram, for each x value, the points per
              combination of layers
            * stacks each point (self.stack_point) and appends it to
              self.drawing['points']
            * updates self.cat_hits[v] = [number of points, number of
              points with at least one truthy layer] for each y value v
        '''
        self.context['canvas'] = {'width': 500, 'height': 500}

        self.drawing = {
            'points': [],
            'x': [],
            'y': [],
            'bar_height': self.bar_height,
            'font_size': self.font_size,
            'label_margin': self.margin,
        }

        points = self.drawing['points']

        self.drawing['colors'] = [
            query.get_color() for query in self.queries.get_queries()
        ]
        self.drawing['summaries'] = [
            query.get_summary() for query in self.queries.get_queries()
        ]

        self.init_bands()

        # process all the points, by ascending start of their x range
        points_order = sorted(self.points.keys(),
                              key=lambda cid: self.points[cid][0][0])
        for cid in points_order:
            # point: [x range, y value(s), layers]
            point = self.points[cid]
            found = any(point[2])
            x = point[0]
            ys = point[1]

            # update the min / max x
            if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                    self.mins[0] is None or self.mins[0] > x[0]):
                self.mins[0] = x[0]
            if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                    self.maxs[0] is None or self.maxs[0] < x[1]):
                self.maxs[0] = x[1]

            # update histogram: for each x value covered by the point,
            # one counter per combination of layers
            # NOTE(review): x[0] / x[1] are tested against None above but
            # not here -- confirm they can never be None at this stage
            layers_key = ','.join(['%s' % li for li in sorted(point[2])])
            for xi in range(x[0], x[1] + 1):
                hist = self.histogram.setdefault(xi, {})
                inc_counter(hist, layers_key)

            # convert y to numerical value
            if not isinstance(ys, list):
                ys = [ys]
            # NOTE(review): when ys holds several values the same `point`
            # object is updated and appended once per value, so all its
            # entries in `points` share the last y -- confirm this is
            # intended
            for v in ys:
                y = self.bands.get(v, 0)

                # add the points to the stack
                point[0] = x
                point[1] = y
                # convert layers from set to list to allow json serialisation
                point[2] = list(point[2])

                self.stack_point(point)
                points.append(point)

                # increment hits per category
                hits = self.cat_hits.setdefault(v, [0, 0])
                hits[0] += 1
                if found:
                    hits[1] += 1