Example #1
0
def parse_string(string, pattern, path=False):
    """Extract named fields from ``string`` according to ``pattern``.

    ``pattern`` is literal text interleaved with placeholders:

    * ``{key}`` lazily captures any run of characters (or, when ``path`` is
      true, any run of characters other than the path separator ``sep``);
    * ``{key literal}`` optionally captures exactly the text ``literal``.

    The pattern is searched for anywhere in ``string`` (``re.search``).

    Returns a dict mapping each key to its captured value (``None`` for an
    optional literal that was absent), or ``None`` when the pattern does not
    match at all.

    Raises ParseError when a key occurs several times in ``pattern`` and the
    occurrences captured different values.
    """
    # With one capture group, re.split alternates literal text (even
    # indexes) and placeholder contents (odd indexes).
    parts = re.split(r'{(.*?)}', pattern)

    # Regex group names are generated from the placeholder position rather
    # than from the key, so keys may contain digits or characters that are
    # not legal in a group name; this maps each group back to its key.
    group_keys = {}
    regex_parts = []
    for index, part in enumerate(parts):
        if index % 2 == 0:
            regex_parts.append(re.escape(part))
            continue

        item = part.split(' ', 1)
        group = '_g{}'.format(index)
        group_keys[group] = item[0]
        if len(item) == 1:
            char = '[^' + re.escape(sep) + ']' if path else '.'
            regex_parts.append('(?P<{}>{}*?)'.format(group, char))
        else:
            # Built in a single formatting step so that a '%' inside the
            # literal cannot be misread as a format directive.
            regex_parts.append(
                '(?P<{}>{})?'.format(group, re.escape(item[1])))

    match = re.search(''.join(regex_parts), string)
    if not match:
        return None

    results = {}
    for group, value in match.groupdict().items():
        key = group_keys[group]
        if key in results and results[key] != value:
            raise ParseError("Problem parsing string '%s'" % string)
        results[key] = value

    return results
Example #2
0
    def _parse_time(self, timestr, fmt):
        """Parse ``timestr`` into a datetime carrying a timezone.

        ``timestr`` is matched against ``self.TIMESTR_RE``, which must yield
        exactly two groups: the date/time text, parsed with ``fmt`` via
        ``strptime``, and the offset text, which has its colons removed
        before being passed to ``getoffset``.

        Raises ParseError when ``timestr`` does not match, or when the match
        does not provide two usable groups.
        """
        match = re.match(self.TIMESTR_RE, timestr)
        if match is None:
            raise ParseError("problem parsing time string %s" % timestr)

        try:
            ts1, ts2 = match.groups()
            ts2 = ts2.replace(':', '')
        except (AttributeError, ValueError):
            # ValueError: the pattern did not produce exactly two groups;
            # AttributeError: the offset group did not participate (None).
            raise ParseError("problem parsing time string %s" % timestr)

        dt = datetime.datetime.strptime(ts1, fmt)
        return dt.replace(tzinfo=getoffset(None, ts2))
Example #3
0
    def _parse_line(self, line, conversation, source, transformed_source):
        """Parse one log line into an entry constructor and its attributes.

        The line is parsed with BeautifulSoup and each top-level node is
        examined in turn.  A comment node supplies the ``alternate`` flag and
        the status text (split on the first ``|``); every other node must be
        a ``status``, ``event`` or ``message`` element.

        Returns a ``(cons, attrs)`` tuple, where ``cons`` is ``Status``,
        ``Event`` or ``Message`` and ``attrs`` holds the values gathered for
        it.

        Raises TypeError for an element with any other tag name and
        ParseError when the line yields no element at all.
        """
        attrs = {}
        cons = None
        status_html = []

        for node in BeautifulSoup(line, ['lxml', 'xml']).children:
            if isinstance(node, Comment):
                # comments look like "<alternate>|<status text>"
                alternate, status_text = node.split('|', 1)
                attrs['alternate'] = bool(alternate)
                status_html = [NavigableString(status_text)]
                continue

            attrs.update((key, node.get(key, ''))
                         for key in ('alias', 'sender', 'auto', 'time'))

            # entries sent by the log's own account use the transformed name
            attrs['isuser'] = attrs['sender'] == source
            if attrs['isuser']:
                attrs['sender'] = transformed_source

            attrs['auto'] = bool(attrs['auto'])
            if attrs['time']:
                attrs['time'] = self._parse_time(
                    attrs['time'], self.STRPTIME_FMT_CONVERSATION)

            attrs['html'] = list(node.children)

            tag = node.name
            if tag == 'status':
                cons = Status
                attrs['type'] = self.STATUS_TYPEMAP.get(node.get('type'), None)
                if attrs['type'] in Status.USER_TYPES:
                    # the element body becomes the message; the text taken
                    # from the preceding comment becomes the html
                    attrs['msg_html'] = attrs['html']
                    attrs['html'] = status_html
            elif tag == 'event':
                cons = Event
                attrs['type'] = self.EVENT_TYPEMAP.get(node.get('type'), None)
            elif tag == 'message':
                cons = Message
            else:
                raise TypeError("unknown type '%s' for entry" % tag)

            if not (attrs['sender'] or attrs['alias']):
                print_d("%s is a system entry" % node)
                attrs['system'] = True

        if not cons:
            raise ParseError("could not parse line: '%s'" % line)

        return cons, attrs
Example #4
0
    def parse_conversation(self, conversation):
        """Populate ``conversation.entries`` from ``conversation.lines``.

        The first line is discarded and the second is parsed as the chat
        header: a comment node there overrides
        ``conversation.original_parser_name`` and the element node supplies
        the service, account and resource.  Each remaining line except
        ``</chat>`` is parsed with ``_parse_line`` and appended as an entry;
        an entry whose time is earlier than the latest time seen so far is
        marked ``delayed``.

        Note that the first two items are popped from ``conversation.lines``
        in place.

        Returns the same ``conversation`` object.

        Raises ParseError when the header contains no element, or when its
        source/service disagree with those already set on ``conversation``.
        """
        lines = conversation.lines
        # The first line is not used, but must still be consumed so that the
        # chat header is the next one.
        lines.pop(0)
        conversation.original_parser_name = self.type

        chat_found = False
        for elem in BeautifulSoup(lines.pop(0), ['lxml', 'xml']).children:
            if isinstance(elem, Comment):
                conversation.original_parser_name = elem.split('/')[1]
            else:
                chat_found = True
                service = self.SERVICE_MAP[elem.get('service')]
                source = elem.get('account')
                conversation.resource = elem.get('resource')
                transformed_source = \
                    self.TRANSFORMS['source'](source, conversation)

        if not chat_found:
            # Without this check the comparison below would fail with an
            # unbound-variable error instead of a parse error.
            raise ParseError("no chat element found for '%s'" %
                             conversation.path)

        if transformed_source != conversation.source or \
                service != conversation.service:
            raise ParseError("mismatch between path and chatinfo for '%s" %
                             conversation.path)

        latest_time = conversation.time
        for line in lines:
            if line == "</chat>":
                continue
            cons, attrs = self._parse_line(line, conversation, source,
                                           transformed_source)
            if attrs['time'] < latest_time:
                attrs['delayed'] = True
            else:
                latest_time = attrs['time']

            try:
                conversation.entries.append(cons(**attrs))
            except Exception:
                # report the line that failed, then re-raise unchanged
                print_e("Problem with element %s" % line)
                raise

        return conversation
Example #5
0
    def write(self, path, conversations):
        """Write a single conversation to ``path`` and copy its images.

        ``conversations`` must contain exactly one conversation.  The output
        consists of the header comment, the title, one line per entry and a
        closing ``</body></html>`` line, encoded as UTF-8.

        Raises ParseError when ``conversations`` does not hold exactly one
        item.
        """
        if len(conversations) != 1:
            raise ParseError(("'%s' only supports one conversation "
                              "per file:\n  '%s' has %i") %
                             (self.type, path, len(conversations)))

        conversation = conversations[0]
        # The context manager closes the file even if writing an entry fails.
        with codecs.open(path, 'wb', 'utf-8') as file_object:
            util.write_comment(
                file_object,
                const.HEADER_COMMENT % conversation.original_parser_name)
            self._write_title(file_object, conversation)

            for entry in conversation.entries:
                # delayed entries use the format that includes the date
                timefmt = self.TIME_FMT_CONVERSATION_WITH_DATE \
                    if entry.delayed else self.TIME_FMT_CONVERSATION
                self._write_entry(file_object, entry, conversation, timefmt)
                file_object.write('\n')

            # closing tags, followed by a newline at end of file
            file_object.write('</body></html>\n')

        self.copy_images(path, conversation)
Example #6
0
    def _parse_line(self, line, conversation, base_time):
        """Parse one log line into an entry constructor and its attributes.

        The raw line is first split into its text and comment parts by
        ``_get_line_data``.  A line with neither is skipped; a line with only
        a comment is rebuilt through ``Entry.from_dump``.  Otherwise the text
        is matched against the message, status and error regexes in that
        order.

        A timestamp without a date takes its date and tzinfo from
        ``base_time``; a timestamp with a date marks the entry as delayed
        and only takes the tzinfo.

        Returns ``(cons, attrs)``, or ``(None, None)`` for a skipped line.

        Raises ParseError when none of the regexes match.
        """
        line, comment = self._get_line_data(line)
        if not (line or comment):
            print_d("Skipping line %s" % line)
            return None, None

        # unrepresentable entry dump
        if not line:
            cons, attrs = Entry.from_dump(comment)
            return cons, attrs

        attrs = dict(alias=None, time=None, sender=None, type=None, html=None)

        candidates = (self.MESSAGE_LINE_RE, self.STATUS_LINE_RE,
                      self.ERROR_LINE_RE)
        for regex in candidates:
            m = regex.match(line)
            if m:
                break
        else:
            raise ParseError("could not parse line '%s'" % line)

        if regex == self.MESSAGE_LINE_RE:
            # Message
            color = m.group('color')
            attrs['alternate'] = color == self.ALTERNATE_COLOR
            timestr = m.group('time')
            attrs['alias'] = m.group('name')
            attrs['auto'] = m.group('auto')
            htmlstr = m.group('html')

            if color == self.SOURCE_COLOR:
                attrs['sender'] = conversation.source
                attrs['isuser'] = True
            elif conversation.isgroup:  # groupchats don't use aliases
                attrs['sender'] = comment if comment else attrs['alias']
            elif color == self.DESTINATION_COLOR:
                attrs['sender'] = conversation.destination
                attrs['isuser'] = False

            cons = Message
        elif regex == self.STATUS_LINE_RE:
            # Status
            timestr = m.group('time')
            htmlstr = m.group('html')
            cons = Status
        elif regex == self.ERROR_LINE_RE:
            # Error: a status entry with a fixed color and type
            timestr = m.group('time')
            htmlstr = m.group('html')
            attrs['color'] = self.ERROR_COLOR
            cons = Status
            attrs['type'] = Status.ERROR

        parsed = parse(timestr, default=datetime.datetime.min, ignoretz=True)
        # delayed has full date in timestamp; otherwise the date is still
        # the default supplied above
        is_delayed = parsed.date() != datetime.date.min
        attrs['delayed'] = is_delayed
        if is_delayed:
            attrs['time'] = parsed.replace(tzinfo=base_time.tzinfo)
        else:
            attrs['time'] = parsed.replace(year=base_time.year,
                                           month=base_time.month,
                                           day=base_time.day,
                                           tzinfo=base_time.tzinfo)

        # wrap in a dummy tag so the fragment has a single root
        soup = BeautifulSoup('<foo>%s</foo>' % htmlstr)
        attrs['html'] = list(soup.foo.children)

        # parse status
        if cons == Status:
            self._parse_status(comment, attrs, conversation)
            if not attrs['type']:
                print_d("No type found for status '%s': using SYSTEM" % line)
                attrs['type'] = Status.SYSTEM

        return cons, attrs
Example #7
0
    def parse_conversation(self, conversation):
        """Read the file at ``conversation.path`` and fill in its entries.

        The first line is the title; the values parsed from it must equal the
        attributes already set on ``conversation`` (``resource`` is copied
        from the title rather than checked).  The remaining lines are grouped
        into entries -- an entry ends on a line ending in ``<br/>``, or is a
        single line starting with ``<!--`` -- and each is parsed with
        ``_parse_line``.

        A non-delayed entry whose time is earlier than the previous entry's
        is moved forward by one day.  While parsing, a map from alias to
        sender is collected; an alias seen with more than one sender is
        dropped from the map and ignored from then on.

        Returns the same ``conversation`` object.

        Raises ParseError when the file has no title line, when the title
        disagrees with ``conversation``, or when the last entry is not
        terminated.
        """
        with codecs.open(conversation.path, encoding='utf-8') as f:
            data = f.read().strip()

        lines = data.split('\n')
        if lines and not lines[-1]:
            del lines[-1]
        if lines and lines[-1].endswith('</html>'):
            del lines[-1]
        if not lines:
            raise ParseError("no title line found in '%s'" %
                             conversation.path)

        title_line, comment = self._get_line_data(lines.pop(0))
        info, conversation.original_parser_name = \
            self._parse_title(title_line, comment, conversation)

        for k, v in info.items():
            # no way to determine resource without reading file first
            if k == 'resource':
                conversation.resource = v
            cv = getattr(conversation, k)
            if v != cv:
                raise ParseError("mismatch between filename and header "
                                 "%s: '%s' != '%s'" % (k, v, cv))

        prev_time = conversation.time
        senders_by_alias = {}
        ignore_aliases = set()
        attrs_list = []
        i = 0

        while i < len(lines):
            line = lines[i]
            # An entry may span several physical lines; keep joining until
            # its terminator is reached.
            while not (line.endswith('<br/>') or line.startswith('<!--')):
                i += 1
                if i >= len(lines):
                    raise ParseError("unterminated entry at end of file: "
                                     "'%s'" % line)
                line += '\n' + lines[i]
            i += 1

            try:
                cons, attrs = self._parse_line(line, conversation, prev_time)
            except ArgumentError:
                print_e('Error on line %s' % line)
                raise

            if not attrs:
                continue

            # a time going backwards without a date means midnight passed
            if attrs['time'] < prev_time and not attrs['delayed']:
                attrs['time'] += datetime.timedelta(days=1)
            prev_time = attrs['time']

            s = attrs['sender']
            a = attrs['alias']
            if s and a and a not in ignore_aliases:
                s2 = senders_by_alias.get(a, s)
                if s != s2:
                    print_d('Multiple senders found for %s (%s)' %
                            (a, '%s, %s' % (s, s2)))
                    # ambiguous alias: forget it and never record it again
                    ignore_aliases.add(a)
                    del senders_by_alias[a]
                else:
                    senders_by_alias[a] = s

            attrs_list.append((cons, attrs))

        conversation.entries, conversation.images = \
            self._get_entries_and_images(conversation, senders_by_alias,
                                         attrs_list)

        return conversation
Example #8
0
    def write(self, path, conversations):
        """Write a single conversation to ``path`` as XML and copy its images.

        ``conversations`` must contain exactly one conversation.  The output
        is the XML header, a header comment, an opening ``chat`` element, one
        element per entry (``message``, ``status`` or ``event``) and a
        closing ``</chat>``, encoded as UTF-8.  An entry may be preceded by a
        comment of the form ``<alternate>|<status text>``.

        Raises ParseError when ``conversations`` does not hold exactly one
        item, and TypeError for an entry that is not a Message, Status or
        Event.
        """
        if len(conversations) != 1:
            raise ParseError(
                ("'%s' only supports one conversation per file:"
                 "\n  %s has %i") % (self.type, path, len(conversations)))
        conversation = conversations[0]

        # The context manager closes the file even if writing an entry fails.
        with codecs.open(path, 'wb', 'utf-8') as file_object:
            file_object.write(self.XML_HEADER + '\n')
            untransformed_source = self.UNTRANSFORMS['source'](
                conversation.source, conversation)
            attrs = dict(xmlns=self.XMLNS,
                         account=untransformed_source,
                         service=self.PAM_ECIVRES[conversation.service],
                         resource=conversation.resource)

            # this attribute will only be useful if we're not the original
            # parser
            if conversation.isgroup and \
                    conversation.original_parser_name != self.type:
                attrs['groupchat'] = "true"

            util.write_comment(
                file_object,
                const.HEADER_COMMENT % conversation.original_parser_name)
            self._write_xml(file_object, 'chat', attrs, conversation,
                            close=False)
            file_object.write('\n')

            # The last two characters of the format are rendered separately
            # so that a colon can be inserted into their output below.
            f1 = self.TIME_FMT_CONVERSATION[:-2]
            f2 = self.TIME_FMT_CONVERSATION[-2:]

            for i, entry in enumerate(conversation.entries):
                attrs = {
                    'alias': entry.alias,
                    'sender': (untransformed_source
                               if entry.sender == conversation.source
                               else entry.sender),
                }
                if isinstance(entry, Message):
                    name = 'message'
                    if entry.auto:
                        attrs['auto'] = "true"
                elif isinstance(entry, Status):
                    name = 'status'
                    attrs['type'] = self.PAMEPYT_SUTATS[entry.type]
                elif isinstance(entry, Event):
                    name = 'event'
                    attrs['type'] = self.PAMEPYT_TNEVE[entry.type]
                    # no alias for event
                    attrs['alias'] = ''
                else:
                    # Without this branch `name` would be unbound, or left
                    # over from the previous entry.
                    raise TypeError("unknown type '%s' for entry" %
                                    type(entry).__name__)

                if entry.system:  # no alias or sender for these
                    del attrs['alias']
                    del attrs['sender']
                elif not attrs['alias']:
                    del attrs['alias']

                v1 = entry.time.strftime(f1)
                v2 = entry.time.strftime(f2)
                attrs['time'] = v1 + v2[:3] + ':' + v2[3:]

                # comments should look like 1|status text
                comment = ['1', ''] if entry.alternate else ['', '']

                if isinstance(entry, Status) and \
                        entry.type in Status.USER_TYPES:
                    htmlattr = 'msg_html'
                    if entry.has_other_html:
                        comment[1] = ''.join([x.string for x in entry.html])
                else:
                    htmlattr = 'html'

                # only write the comment when it carries some information
                if any(comment):
                    util.write_comment(file_object, '|'.join(comment))

                self._write_xml(file_object,
                                name,
                                attrs,
                                conversation,
                                contents=getattr(entry, htmlattr))
                if i != len(conversation.entries) - 1:
                    file_object.write('\n')

            file_object.write('</chat>')

        self.copy_images(path, conversation)