Exemples Python de Doc.title

Langage de programmation: Python

Espace de noms/Paquet : opp.models

Class/Type: Doc

Méthode/Fonction: title

Exemples sur hotexamples.com : 2

Python Doc.title - 2 exemples trouvés. Voici les exemples réels les mieux notés d'opp.models.Doc.title, extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

content(8)

link(5)

update_db(4)

page(4)

numwords(4)

numpages(4)

load_from_db(4)

meta_confidence(3)

authors(2)

tempfile(2)

title(2)

is_philosophy(1)

is_paper(1)

hidden(1)

found_date(1)

earlier_id(1)

source(1)

assign_category(1)

Méthodes fréquemment utilisées

content (8)

link (5)

update_db (4)

page (4)

numwords (4)

numpages (4)

load_from_db (4)

meta_confidence (3)

authors (2)

tempfile (2)

Méthodes fréquemment utilisées

title (2)

is_philosophy (1)

is_paper (1)

hidden (1)

found_date (1)

earlier_id (1)

source (1)

assign_category (1)

Exemple #1

0

Afficher le fichier

Fichier : test_scraper.py Projet : wo/opp-tools

def test_get_duplicate(testdb):
    """Check that scraper.get_duplicate() finds a previously saved paper.

    A first document is written to the database via update_db(). A second
    document with a different url, an extra content prefix, slightly
    different word/page counts and a title differing only in
    capitalisation is then expected to be reported as a duplicate of
    the first.
    """
    attitudes_path = os.path.join(testdir, 'attitudes.txt')

    # the document that already exists in the database
    stored = Doc(url='http://umsu.de/papers/driver-2011.pdf')
    stored.link = Link(url='http://umsu.de/papers/driver-2011.pdf')
    stored.content = readfile(attitudes_path)
    stored.numwords = 13940
    stored.numpages = 26
    stored.authors = 'Wolfang Schwarz'
    stored.title = ('Lost memories and useless coins: '
                    'Revisiting the absentminded driver')
    stored.update_db()

    # the newly found candidate, hosted elsewhere
    candidate = Doc(
        url='http://download.springer.com/static/pdf/307/art%253A10.1007%252Fs11229-015-0699-z.pdf'
    )
    candidate.link = Link(url=candidate.url)
    candidate.content = 'abcdefghjik' + readfile(attitudes_path)
    candidate.numwords = 14130
    candidate.numpages = 29
    candidate.authors = 'Wolfang Schwarz'
    candidate.title = ('Lost memories and useless coins: '
                       'revisiting the absentminded driver')

    match = scraper.get_duplicate(candidate)
    assert match.doc_id == stored.doc_id

Exemple #2

0

Afficher le fichier

Fichier : scraper.py Projet : wo/opp-tools

def process_link(li, force_reprocess=False, redir_url=None, keep_tempfiles=False,
                 recurse=0):
    """
    Fetch url, check for http errors and steppingstones, filter spam,
    parse candidate papers, check for duplicates, check if published
    before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse += 1.

    If force_reprocess is False and the link has already been checked
    at some point, if_modified_since and etag headers are sent.

    Args:
        li: the link object to process; its db entry is updated here.
        force_reprocess: if true, fetch unconditionally and reprocess
            documents that are already in the docs table.
        redir_url: url to fetch instead of li.url (set when following
            a steppingstone page).
        keep_tempfiles: passed on to process_file().
        recurse: number of steppingstone pages followed so far; at
            most 3 are followed.

    Returns:
        1 if a document was stored in the database, 0 if the link was
        rejected (or nothing was fetched). Some early exits return
        whatever li.update_db() or debug() returns instead.
    """
    try:
        li.context = li.html_context()
    except StaleElementReferenceException:
        debug(2, "link element has disappeared")
        return li.update_db(status=1, doc_id=None)
    debug(2, "link context: %s", li.context)

    # ignore links to old and published papers:
    if context_suggests_published(li.context):
        return li.update_db(status=1, doc_id=None)

    # fetch url and handle errors, redirects, etc.:
    url = redir_url or li.url
    r = li.fetch(url=url, only_if_modified=not force_reprocess)
    # note: li.fetch() updates the link entry in case of errors
    if not r:
        return 0

    if r.url != url: # redirected
        #
        # We generally ignore redirect urls and treat li as if it
        # directly led to the redirected address. Exception: if the
        # redirected address is unmanageably long, as on Barry Smith's
        # page.
        if len(r.url) < 500:
            url = util.normalize_url(r.url)

    if r.filetype not in ('html', 'pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(1, "unsupported filetype: %s", r.filetype)

    doc = Doc(url=url, r=r, link=li, source=li.source)

    if doc.load_from_db() and not force_reprocess:
        li.update_db(status=1, doc_id=doc.doc_id)
        return debug(1, "%s is already in docs table", url)

    if r.filetype == 'html':
        r.encoding = 'utf-8'
        try:
            doc.page = Webpage(url, html=r.text)
        except UnparsableHTMLException:
            li.update_db(status=error.code['unsupported filetype'])
            return debug(1, "unparsable html")

        debug(6, "\n====== %s ======\n%s\n======\n", url, r.text)

        # check for steppingstone pages with link to a paper:
        target_url = check_steppingstone(doc.page)
        if target_url and recurse < 3:
            debug(1, "steppingstone to %s", target_url)
            # forward keep_tempfiles too, so the setting is not lost
            # when the target turns out to be a file
            return process_link(li, redir_url=target_url,
                                force_reprocess=force_reprocess,
                                keep_tempfiles=keep_tempfiles,
                                recurse=recurse+1)

        # Genuine papers are almost never in HTML format, and almost
        # every HTML page is not a paper. The few exceptions (such as
        # entries on SEP) tend to require special parsing. Hence the
        # following special treatment. If people start posting
        # articles on medium or in plain HTML, we might return to the
        # old procedure of converting the page to pdf and treating it
        # like any candidate paper.
        from .docparser import webpageparser as htmlparser
        if not htmlparser.parse(doc):
            debug(1, "page ignored")
            li.update_db(status=1)
            return 0

    else:
        # NOTE(review): the block nesting from here on was reconstructed
        # from a whitespace-collapsed listing; the filters below are
        # assumed to apply to HTML pages as well -- confirm against the
        # original file.
        try:
            doc.tempfile = save_local(r)
        except Exception:
            # deliberately broad, but lets KeyboardInterrupt/SystemExit
            # propagate
            return li.update_db(status=error.code['cannot save local file'])
        try:
            # metadata extraction:
            process_file(doc, keep_tempfiles=keep_tempfiles)
        except Exception as e:
            debug(1, 'could not process %s: %s', doc.tempfile, e)
            return li.update_db(status=error.code.get(str(e), 10))

    # estimate whether doc is a handout, cv etc.:
    from .doctyper import paperfilter
    paperprob = paperfilter.evaluate(doc)
    doc.is_paper = int(paperprob * 100)
    if doc.is_paper < 25:
        li.update_db(status=1)
        debug(1, "spam: paper score %s < 25", doc.is_paper)
        return 0

    # estimate whether doc is on philosophy:
    from .doctyper import classifier
    philosophyfilter = classifier.get_classifier('philosophy')
    try:
        doc.is_philosophy = int(philosophyfilter.classify(doc) * 100)
    except UntrainedClassifierException:
        # fall back to a fixed score if the classifier is untrained
        doc.is_philosophy = 90
    if doc.is_philosophy < 25:
        li.update_db(status=1)
        debug(1, "spam: philosophy score %s < 25", doc.is_philosophy)
        return 0

    if li.doc_id:
        # check for revisions:
        olddoc = Doc(doc_id=li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio %s", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0

    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document %s", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0

    # ignore old and published paper:
    if paper_is_old(doc):
        li.update_db(status=1, doc_id=None)
        debug(1, "ignoring already published paper")
        return 0

    # flag for manual approval if confidence low or dubious relevance:
    if doc.is_paper < 60 or doc.is_philosophy < 60 or doc.meta_confidence < 60:
        debug(1, "flagging for manual approval")
        doc.hidden = True

    # don't show papers (incl HTML pages) from newly added source
    # pages in news feed:
    if doc.source.status == 0:
        debug(2, "new source page: setting found_date to 1970")
        doc.found_date = datetime(1970, 1, 1)

    # make sure doc fits in db:
    if len(doc.title) > 255:
        doc.title = doc.title[:251]+'...'
    if len(doc.authors) > 255:
        doc.authors = doc.authors[:251]+'...'

    doc.update_db()
    li.update_db(status=1, doc_id=doc.doc_id)

    # categorize, but only if doc has more than 700 words --
    # otherwise categorization is pretty random:
    if doc.numwords > 700:
        for (cat_id, cat) in categories():
            clf = classifier.get_classifier(cat)
            try:
                strength = int(clf.classify(doc) * 100)
                debug(3, "%s score %s", cat, strength)
            except UntrainedClassifierException:
                continue
            doc.assign_category(cat_id, strength)

    return 1