Example #1
    def _parse(self, enclosing_session=None, verbose=False):
        """ Parse the article content to yield parse trees and annotated token list """
        with SessionContext(enclosing_session) as session:

            # Convert the content soup to a token iterable (generator)
            toklist = Fetcher.tokenize_html(self._url, self._html, session)

            bp = self.get_parser()
            ip = IncrementalParser(bp, toklist, verbose=verbose)

            # List of paragraphs, where each paragraph is a list of sentences
            # and each sentence is a token list in string dump format
            # (1-based paragraph and sentence indices)
            pgs = []

            # Dict of parse trees in string dump format,
            # stored by sentence index (1-based)
            trees = OrderedDict()

            # Word stem dictionary, indexed by (stem, cat)
            words = defaultdict(int)
            num_sent = 0

            for p in ip.paragraphs():

                pgs.append([])

                for sent in p.sentences():

                    num_sent += 1

                    if sent.parse():
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(
                            sent.tree)
                        pgs[-1].append(
                            Article._dump_tokens(sent.tokens, sent.tree,
                                                 words))
                    else:
                        # Error or no parse: add an error index entry for this sentence
                        eix = sent.err_index
                        trees[num_sent] = "E{0}".format(eix)
                        pgs[-1].append(
                            Article._dump_tokens(sent.tokens, None, None, eix))

            parse_time = ip.parse_time

            self._parsed = datetime.utcnow()
            self._parser_version = bp.version
            self._num_tokens = ip.num_tokens
            self._num_sentences = ip.num_sentences
            self._num_parsed = ip.num_parsed
            self._ambiguity = ip.ambiguity

            # Make one big JSON string for the paragraphs, sentences and tokens
            self._raw_tokens = pgs
            self._tokens = json.dumps(pgs,
                                      separators=(',', ':'),
                                      ensure_ascii=False)
            self._words = words
            # self._tokens = "[" + ",\n".join("[" + ",\n".join(sent for sent in p) + "]" for p in pgs) + "]"
            # Create a tree representation string out of all the accumulated parse trees
            self._tree = "".join("S{0}\n{1}\n".format(key, val)
                                 for key, val in trees.items())
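
A minimal sketch (not from the source) of the two string formats the method above builds: the dump strings below are hypothetical placeholders standing in for real ParseForestDumper and token-dump output.

from collections import OrderedDict
import json

# Hypothetical per-sentence parse results: sentence 1 parsed,
# sentence 2 failed to parse at token index 7 (stored as "E7")
trees = OrderedDict()
trees[1] = "P2\nNP ..."  # placeholder for a ParseForestDumper dump
trees[2] = "E7"

# Same concatenation as in the method: one "S{index}" header line
# per sentence, followed by its dump
tree_text = "".join("S{0}\n{1}\n".format(key, val)
                    for key, val in trees.items())
assert tree_text == "S1\nP2\nNP ...\nS2\nE7\n"

# Compact JSON token dump, matching the separators used above
pgs = [[["token", "dump"]]]  # hypothetical paragraph/sentence/token nesting
assert json.dumps(pgs, separators=(",", ":"), ensure_ascii=False) == '[[["token","dump"]]]'
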
Example #2
    def _parse(self, enclosing_session=None, verbose=False):
        """ Parse the article content to yield parse trees and annotated token list """
        with SessionContext(enclosing_session) as session:

            # Convert the content soup to a token iterable (generator)
            toklist = Fetcher.tokenize_html(self._url, self._html, session)

            bp = self.get_parser()
            ip = IncrementalParser(bp, toklist, verbose=verbose)

            # List of paragraphs, where each paragraph is a list of sentences
            # and each sentence is a token list in string dump format
            # (1-based paragraph and sentence indices)
            pgs = []

            # Dict of parse trees in string dump format,
            # stored by sentence index (1-based)
            trees = OrderedDict()

            # Word stem dictionary, indexed by (stem, cat)
            words = defaultdict(int)
            num_sent = 0

            for p in ip.paragraphs():

                pgs.append([])

                for sent in p.sentences():

                    num_sent += 1
                    num_tokens = len(sent)

                    # We don't attempt to parse very long sentences (>100 tokens)
                    # since they are memory-intensive (>16 GB) and may take
                    # minutes to process
                    if num_tokens <= MAX_SENTENCE_TOKENS and sent.parse():
                        # Dump the tokens, annotated with parse information
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, sent.tree, words)
                        # Create a verbose text representation of
                        # the highest scoring parse tree
                        tree = ParseForestDumper.dump_forest(
                            sent.tree, token_dicts=token_dicts)
                        # Add information about the sentence tree's score
                        # and the number of tokens
                        trees[num_sent] = "\n".join([
                            "C{0}".format(sent.score),
                            "L{0}".format(num_tokens), tree
                        ])
                    else:
                        # Error, sentence too long or no parse:
                        # add an error index entry for this sentence
                        if num_tokens > MAX_SENTENCE_TOKENS:
                            # Set the error index at the first
                            # token outside the maximum limit
                            eix = MAX_SENTENCE_TOKENS
                        else:
                            eix = sent.err_index
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, None, None, eix)
                        trees[num_sent] = "E{0}".format(eix)

                    pgs[-1].append(token_dicts)

            # parse_time = ip.parse_time

            self._parsed = datetime.utcnow()
            self._parser_version = bp.version
            self._num_tokens = ip.num_tokens
            self._num_sentences = ip.num_sentences
            self._num_parsed = ip.num_parsed
            self._ambiguity = ip.ambiguity

            # Make one big JSON string for the paragraphs, sentences and tokens
            self._raw_tokens = pgs
            self._tokens = json.dumps(pgs,
                                      separators=(",", ":"),
                                      ensure_ascii=False)

            # Keep the bag of words (stem, category, count for each word)
            self._words = words

            # Create a tree representation string out of all the accumulated parse trees
            self._tree = "".join("S{0}\n{1}\n".format(key, val)
                                 for key, val in trees.items())
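
A minimal sketch (assumed values and hypothetical names, not from the source) of what Example #2 changes in the tree dump: parsed sentences now get a score line ("C{score}") and a token-count line ("L{num_tokens}") above the tree dump, and sentences longer than MAX_SENTENCE_TOKENS are rejected up front with the error index set at the limit.

MAX_SENTENCE_TOKENS = 100  # assumed limit, per the comment in Example #2

def sentence_record(num_tokens, parsed, score=0, tree_dump="", err_index=0):
    """ Build the tree-dump record for one sentence, mirroring the
        branches in Example #2 (this helper is hypothetical) """
    if num_tokens <= MAX_SENTENCE_TOKENS and parsed:
        # Score line, token-count line, then the parse forest dump
        return "\n".join(["C{0}".format(score),
                          "L{0}".format(num_tokens), tree_dump])
    # Sentence too long or no parse: store an error marker instead
    eix = MAX_SENTENCE_TOKENS if num_tokens > MAX_SENTENCE_TOKENS else err_index
    return "E{0}".format(eix)

assert sentence_record(9, True, score=42, tree_dump="P2\nNP ...") == "C42\nL9\nP2\nNP ..."
assert sentence_record(250, False) == "E100"
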