Example #1
def build_query(location=None,
                query_string=None,
                idx=None,
                text_line_threshold=15,
                bin_line_threshold=50):
    """
    Return a Query built from location or query string given an index.
    """
    if location:
        T = typecode.get_type(location)
        # TODO: implement additional type-driven heuristics for query chunking.
        if not T.contains_text:
            return
        if T.is_binary:
            # for binaries we want to avoid a large number of query runs as the
            # license context is often very sparse or absent
            qry = Query(location=location,
                        idx=idx,
                        line_threshold=bin_line_threshold)
        else:
            # for text
            qry = Query(location=location,
                        idx=idx,
                        line_threshold=text_line_threshold)
    else:
        # a string is always considered text
        qry = Query(query_string=query_string, idx=idx)

    return qry
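
A minimal usage sketch, assuming `idx` is a prebuilt token index from the surrounding codebase; `Query` and `typecode` are taken as given from the snippet above:

# hypothetical usage: `idx` is assumed to be a prebuilt token index
qry = build_query(location='/tmp/README', idx=idx)
if qry is None:
    print('no extractable text at this location')

# building from an in-memory string instead of a file
qry = build_query(query_string='Licensed under the MIT license', idx=idx)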
Example #2
def convert_to_utf8(location):
    """
    Convert the file at location to UTF-8 text.
    Return the location of the converted file or None.
    """
    if not get_type(location).is_text:
        return location
    with open(location, 'rb') as f:
        start = f.read(4096)
    encoding = chardet.detect(start)
    if encoding:
        encoding = encoding.get('encoding', None)
        if encoding:
            target = os.path.join(fileutils.get_temp_dir('markup'),
                                  fileutils.file_name(location))
            with codecs.open(location,
                             'rb',
                             encoding=encoding,
                             errors='replace',
                             buffering=16384) as inf:
                with codecs.open(target, 'wb', encoding='utf-8') as outf:
                    outf.write(inf.read())
            return target
        else:
            # chardet failed somehow to detect an encoding
            return location
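
The encoding detection step can be tried in isolation. This small sketch uses the standard `chardet` API, which takes bytes and returns a dict with 'encoding' and 'confidence' keys; detection on short samples may be unreliable:

import chardet

sample = 'héllo wörld'.encode('latin-1')
result = chardet.detect(sample)
# result is a dict, e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
print(result.get('encoding'), result.get('confidence'))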
Example #3
def unicode_text_lines_from_binary(location):
    """
    Return an iterable over unicode text lines extracted from a binary file at
    location.
    """
    T = typecode.get_type(location)
    if T.contains_text:
        for line in strings.strings_from_file(location):
            yield line
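
`strings.strings_from_file` belongs to the surrounding codebase; conceptually it extracts runs of printable characters from binary content, much like the Unix `strings` tool. A minimal self-contained sketch of that idea:

import re

# runs of 4 or more printable ASCII characters
PRINTABLE = re.compile(rb'[\x20-\x7e]{4,}')

def ascii_strings_from_file(location, chunk_size=1024 * 1024):
    # simplified sketch: strings spanning chunk boundaries are missed
    with open(location, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            for match in PRINTABLE.finditer(chunk):
                yield match.group().decode('ascii')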
Example #5
def unicode_text_lines(location):
    """
    Return an iterable over unicode text lines from a text file at location.
    Open the file as binary with universal new lines then try to decode each
    line as Unicode.
    """
    T = typecode.get_type(location)
    if T.contains_text:
        with open(location, 'rbU') as f:  # Python 2 mode; use 'rb' on Python 3
            for line in f:
                yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))
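
`as_unicode` and `remove_verbatim_cr_lf_tab_chars` come from the surrounding codebase. A hypothetical stand-in for the decoding half could decode bytes leniently, with a latin-1 fallback that never fails:

def as_unicode_sketch(line):
    # hypothetical stand-in for as_unicode
    if isinstance(line, str):
        return line
    try:
        return line.decode('utf-8')
    except UnicodeDecodeError:
        return line.decode('latin-1')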
Example #7
def is_markup(location):
    """
    Return True if the file at `location` is some kind of markup, such as HTML,
    XML, PHP, etc.
    """
    T = get_type(location)

    # ignore small files: too little content to detect markup reliably
    if T.size < 64:
        return False

    if not T.is_text:
        return False

    # `extensions` is expected to be a module-level tuple of markup file
    # extensions (str.endswith accepts a tuple of suffixes)
    if location.endswith(extensions):
        return True

    with open(location, 'rb') as f:
        start = as_unicode(f.read(1024))

    if start.startswith('<'):
        return True

    # remove all whitespace to compute character ratios
    no_spaces = ''.join(start.split())

    # count opening and closing tag markers
    counts = Counter(c for c in no_spaces if c in '<>')

    if not all(c in counts for c in '<>'):
        return False

    if not all(counts.values()):
        return False

    # tag <> markers should make up more than ~5 percent of non-whitespace
    # characters
    has_tags = sum(counts.values()) / len(no_spaces) > 0.05

    # check that opening and closing markers are roughly balanced: the ratio
    # of closing to opening markers should approach 1, accepting a 20% drift
    open_close = counts['>'] / counts['<']
    balanced = abs(1 - open_close) < .2
    return has_tags and balanced
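
The tag-density heuristic is easy to exercise on in-memory strings; this standalone sketch reproduces the counting logic without the file-type and size checks:

from collections import Counter

def looks_like_markup(text):
    if text.startswith('<'):
        return True
    no_spaces = ''.join(text.split())
    counts = Counter(c for c in no_spaces if c in '<>')
    if not all(c in counts for c in '<>'):
        return False
    has_tags = sum(counts.values()) / len(no_spaces) > 0.05
    balanced = abs(1 - counts['>'] / counts['<']) < .2
    return has_tags and balanced

print(looks_like_markup('<p>some <b>bold</b> text</p>'))     # True
print(looks_like_markup('plain prose with no tags at all'))  # False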
Example #8
def build_query(location=None, query_string=None, idx=None):
    """
    Return a Query built from location or query string given an index.
    """
    if location:
        T = typecode.get_type(location)
        # TODO: implement additional type-driven heuristics for query chunking.
        if not T.contains_text:
            return
        if T.is_binary:
            # for binaries we want to avoid a large number of query runs as the
            # license context is often very sparse or absent
            qry = Query(location=location, idx=idx, line_threshold=1000)
        else:
            # for text
            qry = Query(location=location, idx=idx, line_threshold=80)
    else:
        # a string is always considered text
        qry = Query(query_string=query_string, idx=idx)

    return qry
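
Compared to Example #1, this variant hardcodes the line thresholds (1000 for binaries, 80 for text) instead of exposing them as keyword arguments; the control flow is otherwise identical.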
Example #9
    def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
        """
        Tokenize this query and populate tokens and query_runs at each break
        point. Only keep known token ids, but consider unknown token ids when
        breaking a query into runs.

        `tokens_by_line` is the output of the self.tokens_by_line() method and
        is an iterator of lines (e.g. lists) of token ids.
        `line_threshold` is the number of empty or junk lines that triggers a
        break into a new run.
        """
        len_junk = self.idx.len_junk
        digit_only_tids = self.idx.digit_only_tids

        # initial query run
        query_run = QueryRun(query=self, start=0)

        # break into runs based on a threshold of lines that are either empty,
        # all unknown, all low-id/junk tokens, or made only of digits.
        empty_lines = 0

        # token positions start at zero
        pos = 0

        # bind frequently called functions to local scope
        tokens_append = self.tokens.append
        query_runs_append = self.query_runs.append

        if self.location:
            ft = typecode.get_type(self.location)
            if ft.is_text_with_long_lines:
                tokens_by_line = break_long_lines(tokens_by_line)

        for tokens in tokens_by_line:
            # have we reached a run break point?
            if len(query_run) > 0 and empty_lines >= line_threshold:
                query_runs_append(query_run)
                # start new query run
                query_run = QueryRun(query=self, start=pos)
                empty_lines = 0

            if len(query_run) == 0:
                query_run.start = pos

            if not tokens:
                empty_lines += 1
                continue

            line_has_known_tokens = False
            line_has_good_tokens = False
            line_is_all_digit = all(
                [tid is None or tid in digit_only_tids for tid in tokens])

            for token_id in tokens:
                if token_id is not None:
                    tokens_append(token_id)
                    line_has_known_tokens = True
                    if token_id >= len_junk:
                        line_has_good_tokens = True
                    query_run.end = pos
                    pos += 1

            if line_is_all_digit:
                # an all-digit line counts as empty and may close the current run
                empty_lines += 1
                continue

            if not line_has_known_tokens:
                empty_lines += 1
                continue

            if line_has_good_tokens:
                empty_lines = 0
            else:
                empty_lines += 1

        # append final run if any
        if len(query_run) > 0:
            if not all(tid in digit_only_tids for tid in query_run.tokens):
                query_runs_append(query_run)

        if TRACE_QR:
            print()
            logger_debug('Query runs for query:', self.location)
            for qr in self.query_runs:
                high_matchables = len(
                    [p for p, t in enumerate(qr.tokens) if t >= len_junk])

                print(' ', repr(qr), 'high_matchables:', high_matchables)
            print()
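
The run-breaking logic can be illustrated on toy data. This standalone sketch splits lines of token ids into runs after a threshold of consecutive empty lines, leaving out the junk- and digit-line refinements of the real method:

def split_into_runs(tokens_by_line, line_threshold=4):
    runs = []
    current = []
    empty_lines = 0
    for tokens in tokens_by_line:
        # break point reached: close the current run and start a new one
        if current and empty_lines >= line_threshold:
            runs.append(current)
            current = []
            empty_lines = 0
        if not tokens:
            empty_lines += 1
            continue
        current.extend(tokens)
        empty_lines = 0
    if current:
        runs.append(current)
    return runs

lines = [[1, 2], [], [], [], [], [3, 4, 5]]
print(split_into_runs(lines))  # [[1, 2], [3, 4, 5]]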
Example #10
def text_lines(location, demarkup=False):
    """
    Return a text lines iterator from file at `location`. Return an empty
    iterator if no text content is extractible. Text extraction is based on
    detected file type.

    if `demarkup` is True, attempt to detect if a file contains HTML/XML-like
    markup and cleanup this markup.

    Note: For testing or building from strings, location can be a is a list of
    unicode line strings.
    """
    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this for
    # binaries only. This is in direct conflict with "strings" extraction as
    # currently implemented

    if not location:
        return iter([])

    if not isinstance(location, basestring):  # Python 2; use str on Python 3
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        return iter(location)

    T = typecode.get_type(location)

    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, pdf office doc, text or binary?
    if T.is_pdf:
        return unicode_text_lines_from_pdf(location)

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            return markup.demarkup(location)
        except Exception:
            # fall back to treating the file as plain text below
            pass

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    if T.is_text:
        return unicode_text_lines(location)

    # DO NOT introspect media, archives and compressed files
#    if not T.contains_text:
#        return iter([])

    if T.is_binary:
        # fall back to binary
        return unicode_text_lines_from_binary(location)

    else:
        # if neither text, text-like nor binary: treat as binary
        # this should never happen
        # fall back to binary
        return unicode_text_lines_from_binary(location)
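
A hedged usage sketch: the helper always returns an iterator, so callers can loop without first checking the detected file type. The list form is handy in tests:

# hypothetical usage over a file path
for line in text_lines('/tmp/NOTICE', demarkup=True):
    print(line.rstrip())

# a list of unicode lines is passed through unchanged
assert list(text_lines([u'a line\n'])) == [u'a line\n']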
Example #11
    def _tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
        """
        Tokenize this query and populate tokens and query_runs at each break
        point, breaking runs after `line_threshold` empty or junk lines.
        """
        len_legalese = self.idx.len_legalese
        digit_only_tids = self.idx.digit_only_tids

        # initial query run
        query_run = QueryRun(query=self, start=0)

        # break into runs based on a threshold of lines that are either empty,
        # all unknown, all low-id/junk tokens, or made only of digits.
        empty_lines = 0

        # token positions start at zero
        pos = 0

        # bind frequently called functions to local scope
        tokens_append = self.tokens.append
        query_runs_append = self.query_runs.append

        if self.location:
            ft = typecode.get_type(self.location)
            if ft.is_text_with_long_lines:
                self.has_long_lines = True
                tokens_by_line = break_long_lines(tokens_by_line)
            if ft.is_binary:
                self.is_binary = True

        for tokens in tokens_by_line:
            # have we reached a run break point?
            if len(query_run) > 0 and empty_lines >= line_threshold:
                query_runs_append(query_run)
                # start new query run
                query_run = QueryRun(query=self, start=pos)
                empty_lines = 0

            if len(query_run) == 0:
                query_run.start = pos

            if not tokens:
                empty_lines += 1
                continue

            line_has_known_tokens = False
            line_has_good_tokens = False
            line_is_all_digit = all(
                [tid is None or tid in digit_only_tids for tid in tokens])

            for token_id in tokens:
                if token_id is not None:
                    tokens_append(token_id)
                    line_has_known_tokens = True
                    if token_id < len_legalese:
                        line_has_good_tokens = True
                    query_run.end = pos
                    pos += 1

            if line_is_all_digit:
                # an all-digit line counts as empty and may close the current run
                empty_lines += 1
                continue

            if not line_has_known_tokens:
                empty_lines += 1
                continue

            if line_has_good_tokens:
                empty_lines = 0
            else:
                empty_lines += 1

        # append final run if any
        if len(query_run) > 0:
            if not all(tid in digit_only_tids for tid in query_run.tokens):
                query_runs_append(query_run)

        if TRACE_QR:
            print()
            logger_debug('Query runs for query:', self.location)
            for qr in self.query_runs:
                high_matchables = len(
                    [p for p, t in enumerate(qr.tokens) if t < len_legalese])

                print(' ', repr(qr), 'high_matchables:', high_matchables)
            print()
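
Note the inverted id convention relative to Example #9: here legalese tokens have low ids, so a line has good tokens when `token_id < len_legalese`, whereas Example #9 treats ids at or above `len_junk` as good. The run-breaking logic is otherwise the same.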
Example #12
def numbered_text_lines(
    location,
    demarkup=False,
    plain_text=False,
    start_line=1,
):
    """
    Yield tuples of (line number, text line) from the file at `location`. Return
    an empty iterator if no text content is extractible. Text extraction is
    based on detected file type. Long lines are broken down in chunks, therefore
    two items can have the same line number.

    line numbers start at ``start_line`` which is 1-based by default.

    If `demarkup` is True, attempt to detect if a file contains HTML/XML-like
    markup and cleanup this markup.

    If `plain_text` is True treat the file as a plain text file and do not
    attempt to detect its type and extract it's content with special procedures.
    This is used mostly when loading license texts and rules.

    Note: For testing or building from strings, location can be a is a list of
    unicode line strings.
    """
    if not location:
        return iter([])

    if not isinstance(location, str):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        if TRACE:
            logger_debug('numbered_text_lines:', 'location is not a file')
        return enumerate(iter(location), start_line)

    if plain_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'plain_text')
        return enumerate(unicode_text_lines(location), start_line)

    T = typecode.get_type(location)

    if TRACE:
        logger_debug('numbered_text_lines: T.filetype_file:', T.filetype_file)
        logger_debug('numbered_text_lines: T.is_text_with_long_lines:',
                     T.is_text_with_long_lines)
        logger_debug('numbered_text_lines: T.is_binary:', T.is_binary)

    # TODO: we should have a command line to force digging inside binaries
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, pdf office doc, text or binary?
    if T.is_pdf and T.is_pdf_with_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_pdf')
        return enumerate(unicode_text_lines_from_pdf(location), start_line)

    if T.filetype_file.startswith('Spline Font Database'):
        if TRACE:
            logger_debug('numbered_text_lines:', 'Spline Font Database')
        return enumerate(
            (as_unicode(l) for l in sfdb.get_text_lines(location)),
            start_line,
        )

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            lines = list(enumerate(markup.demarkup(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup')
            return lines
        except Exception:
            # fall back to treating the file as plain text below
            pass

    if T.is_js_map:
        try:
            lines = list(enumerate(js_map_sources_lines(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map')
            return lines
        except Exception:
            # fall back to treating the file as plain text below
            pass

    if T.is_text:
        numbered_lines = enumerate(unicode_text_lines(location), start_line)
        # text with very long lines such as minified JS, JS map files or large JSON
        if (not location.endswith('package.json')
                and (T.is_text_with_long_lines or T.is_compact_js
                     or T.filetype_file == 'data' or 'locale' in location)):

            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines:',
                             'break_numbered_unicode_text_lines')
        return numbered_lines

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this for
    # binaries only. This may conflict with "strings" extraction as currently
    # implemented.
    if T.is_binary:
        # fall back to binary
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_binary')

        return enumerate(unicode_text_lines_from_binary(location), start_line)

    return iter([])
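
A short usage sketch; the numbering itself is plain `enumerate` starting at `start_line`:

# hypothetical usage: yields (line_number, line) tuples
for line_number, line in numbered_text_lines('/tmp/COPYING', start_line=1):
    print(line_number, line.rstrip())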