Beispiel #1
0
 def store(self, name, content, overwrite=False):
     """Write ``content`` as UTF-8 text to ``self.d / name``.

     The content is passed through ``force_unicode``, encoded as UTF-8 and
     written to the target followed by a single newline.

     :param name: path of the target, joined onto ``self.d``.
     :param content: text to store.
     :param overwrite: when false (the default) an existing target is an
         error; when true it is replaced.
     :returns: the UTF-8 encoded content, without the trailing newline.
     :raises AssertionError: if the target exists and ``overwrite`` is false.
     """
     t = self.d / name
     # Raised explicitly instead of via ``assert`` so the overwrite guard is
     # still enforced when Python runs with -O (which strips asserts).
     if not overwrite and t.exists():
         raise AssertionError(name + ' already exists!')
     # Convert before opening: mode 'wb' truncates the target, so a failing
     # conversion must happen first to leave an existing file untouched.
     content = force_unicode(content).encode('utf8')
     with open(t, 'wb') as f:
         f.write(content)
         f.write(b'\n')  # new line at end of file
     return content
Beispiel #2
0
 def note_template(self, x):
     """Render ``TEMPLATE`` for the metadata mapping ``x``.

     Keys other than title, author, year, source, cached, tags and notes
     are rendered as ``:key: value`` lines (one per key) and handed to the
     template as ``attrs``; every key of ``x`` is also passed to the
     template as a keyword argument. The rendered text is returned encoded
     as UTF-8.
     """
     x = unicodify_dict(x)
     extra_keys = set(x) - set(
         'title author year source cached tags notes'.split())
     # One stripped ":key: value" line per non-standard key.
     lines = []
     for key in extra_keys:
         entry = u':%s: %s' % (key, x[key])
         lines.append(entry.strip())
     attrs = u'\n'.join(lines).strip()
     if attrs:
         # Terminate a non-empty attribute section with a newline.
         attrs = attrs + '\n'
     rendered = TEMPLATE.format(attrs=attrs, **x)
     return force_unicode(rendered).encode('utf8')
Beispiel #3
0
def robust_read_string(x, verbose=0):
    """Decode the byte string ``x`` using an automatically detected encoding.

    The whole string is fed to a ``UniversalDetector`` (presumably chardet's
    -- confirm against the file's imports). The detected encoding, or utf8
    when none is reported, is used to decode ``x`` with undecodable bytes
    replaced. The text is then re-encoded as UTF-8 and passed to
    ``force_unicode``, whose result is returned.

    :param x: byte string to decode.
    :param verbose: when true, print the detector's result.
    """
    detector = UniversalDetector()
    detector.feed(x)
    detector.close()
    if verbose:
        # Single pre-formatted argument: prints the same text whether
        # ``print`` is the statement or the function.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    try:
        text = x.decode(encoding, 'replace')
    except LookupError:
        # The detector reported an encoding name this interpreter has no
        # codec for; fall back to utf8 rather than failing outright.
        text = x.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
Beispiel #4
0
def robust_read(filename, verbose=0):
    """Read ``filename`` and decode it using an automatically detected encoding.

    The file is fed line by line to a ``UniversalDetector`` (presumably
    chardet's -- confirm against the file's imports) until the detector
    reports it is done. The whole file is then read and decoded with the
    detected encoding, or utf8 when none is reported, with undecodable
    bytes replaced. The text is re-encoded as UTF-8 and passed to
    ``force_unicode``, whose result is returned.

    :param filename: path of the file to read.
    :param verbose: when true, print the detector's result.
    """
    detector = UniversalDetector()
    # Binary mode so the raw bytes reach the detector and decoder unchanged;
    # the ``with`` block closes the handle even when detection stops early.
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if verbose:
        # Single pre-formatted argument: prints the same text whether
        # ``print`` is the statement or the function.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    with open(filename, 'rb') as f:
        data = f.read()
    try:
        text = data.decode(encoding, 'replace')
    except LookupError:
        # The detector reported an encoding name this interpreter has no
        # codec for; fall back to utf8 rather than failing outright.
        text = data.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
Beispiel #5
0
    def extract_plaintext(self):
        """Extract plain text from ``self.cached`` and store it as 'data/text'.

        A path ending in '.pdf' is converted with ``pdftotext`` (output file
        under ``self.d``, cached output allowed); anything else is read with
        ``robust_read`` and cleaned with ``htmltotext``. Ligatures are then
        removed and the text is written via ``self.store``, whose return
        value is returned.
        """
        source = self.cached

        if not source.endswith('.pdf'):
            # Non-PDF documents are read as text and stripped of HTML.
            raw = force_unicode(robust_read(source))
            plain = htmltotext(raw)
        else:
            # PDFs go through pdftotext, which may reuse its cached output.
            plain = pdftotext(
                source,
                output=self.d / 'data' / 'pdftotext.txt',
                verbose=True,
                usecached=True,
            )

        cleaned = remove_ligatures(plain)
        return self.store('data/text', cleaned, overwrite=True)
Beispiel #6
0
    def __init__(self, raw):
        """Parse ``raw`` as exactly one BibTeX entry and register it by field.

        The stripped source is parsed with ``bibtex.Parser``; the single
        resulting entry's key, entry object and fields are stored on the
        instance. Each person role of the entry is copied into the fields
        mapping, and the instance is appended to the module-level ``fields``
        index under every field name (presumably a mapping from field name
        to a list of entries -- it is defined outside this block).

        :param raw: BibTeX source text for a single entry.
        :raises AssertionError: if ``raw`` does not contain exactly one
            entry, or if the entry has more than two person roles.
        """
        self._raw = raw
        self.raw = force_unicode(raw.strip())
        self.styles = {}

        bibliography = bibtex.Parser().parse_stream(StringIO(self.raw))
        entries = bibliography.entries
        assert len(entries) == 1, 'Entry is supposed to represent only one BibTex entry.'

        # Materialise the items first: items() is only subscriptable when it
        # happens to return a list.
        self.key, self.entry = list(entries.items())[0]
        self.fields = self.entry.fields

        # Expose each person role as an ordinary field; iterate over a
        # snapshot so assigning into self.fields cannot disturb the loop.
        for role, people in list(self.entry.persons.items()):
            self.fields[role] = people

        assert len(self.entry.persons) <= 2, 'ERROR: too many people.'

        # Register this entry under every field name it carries.
        for k in self.fields:
            fields[k].append(self)
Beispiel #7
0
    def __init__(self, raw):
        """Parse ``raw`` as exactly one BibTeX entry and register it by field.

        The stripped source is parsed with ``bibtex.Parser``; the single
        resulting entry's key, entry object and fields are stored on the
        instance. Each person role of the entry is copied into the fields
        mapping, and the instance is appended to the module-level ``fields``
        index under every field name (presumably a mapping from field name
        to a list of entries -- it is defined outside this block).

        :param raw: BibTeX source text for a single entry.
        :raises AssertionError: if ``raw`` does not contain exactly one
            entry, or if the entry has more than two person roles.
        """
        self._raw = raw
        self.raw = force_unicode(raw.strip())
        self.styles = {}

        bibliography = bibtex.Parser().parse_stream(StringIO(self.raw))
        entries = bibliography.entries
        assert len(entries) == 1, 'Entry is supposed to represent only one BibTex entry.'

        # Materialise the items first: items() is only subscriptable when it
        # happens to return a list.
        self.key, self.entry = list(entries.items())[0]
        self.fields = self.entry.fields

        # Expose each person role as an ordinary field; iterate over a
        # snapshot so assigning into self.fields cannot disturb the loop.
        for role, people in list(self.entry.persons.items()):
            self.fields[role] = people

        assert len(self.entry.persons) <= 2, 'ERROR: too many people.'

        # Register this entry under every field name it carries.
        for k in self.fields:
            fields[k].append(self)
Beispiel #8
0
def uni(x):
    """Convert a string, or an arbitrarily nested list of strings, to unicode.

    A list is converted element by element (recursively) and a new list is
    returned; any other value must be a string and is passed to
    ``force_unicode``.

    :param x: a string or a (possibly nested) list of strings.
    :returns: the result of ``force_unicode`` for a string, or a list of
        converted elements for a list.
    :raises AssertionError: if a non-list value is not a string.
    """
    if isinstance(x, list):
        # Build the list eagerly: ``map`` yields a lazy iterator on
        # Python 3, which would return a non-list and defer the type
        # assertion on the elements until someone iterates it.
        return [uni(item) for item in x]
    assert isinstance(x, basestring), x
    return force_unicode(x)