Code example #1
    def _identify_exact_matches(self, classroom_id, first_name,
                                last_name, middle_initial):
        """Search for exact matches to identify students.

        "Exact" means match on first name, last name, birthday, and school.
        """
        stripped_first_name = util.clean_string(first_name)
        stripped_last_name = util.clean_string(last_name)
        if middle_initial is not None:
            middle_initial = util.clean_string(middle_initial).upper()

        # <issue #208>
        #   <remove later>
        # For the time being, it also means either regular or stripped versions
        # of names. In the future, we will only process stripped names.
        normal_q = self._base_identify_query(classroom_id)
        normal_q.filter('first_name =', first_name)
        normal_q.filter('last_name =', last_name)
        normal_q.filter('middle_initial =', middle_initial)
        #   </remove later>
        # </issue #208>

        # Query based on stripped names because we expect students to type
        # their name differently from session to session. Stripping attempts
        # to make their name uniform and still unique. See util.clean_string().
        stripped_q = self._base_identify_query(classroom_id)
        stripped_q.filter('stripped_first_name =', stripped_first_name)
        stripped_q.filter('stripped_last_name =', stripped_last_name)
        stripped_q.filter('middle_initial =', middle_initial)

        # <issue #208>
        #   <remove later>
        combined_results = normal_q.fetch(5) + stripped_q.fetch(5)
        unique_results = list(set(combined_results))
        return unique_results
Code example #2
    def scrape_urls(self, response):
        #1. sort through data and extract urls
        #2. put urls together
        #3. Loop to each url, returning @parse
        base_url = "https://www.walmart.com"
        self.raw = response.body_as_unicode()
        #print("raw: " + self.raw)
        remove = ['{', '}', 'Link', ' ']
        self.cleaned = self.raw
        for char in remove:
            self.cleaned = self.cleaned.replace(char, '')
        self.comma_split = self.cleaned.split('","')
        #print ("cleaned - " + cleaned)
        #print ("comma_split - " )
        #print (*comma_split)
        self.colon_split = [entry.split('":"') for entry in self.comma_split]
        #inspect_response(response, self)
        self.colon_split[0].remove('"sections')
        #print ("colon_split - ")
        #print (*colon_split)
        self.urls = [entry[-1] for entry in self.colon_split]
        #print("urls - ")
        #print(self.urls)

        section = "unset"
        subsection = "unset"

        self.section_dict = {}
        chars_to_remove=["\'","&"]
        for entry in self.colon_split:

            # each entry will have a subheading (normally at 0 unless it has a heading entry)
            section = clean_string(entry[0],chars_to_remove)
            url_end = clean_string(entry[-1],"\"")

            # if it's a section header it will contain 3 entries
            #   and all subsequent entries will have the same heading
            if len(entry) > 2:
                section = clean_string(entry[0],chars_to_remove)
                subsection = clean_string(entry[1],chars_to_remove)

            url = base_url + url_end
            category=lookup_category("",section,subsection)
            store_url(self.conn,url,self.store_id,category,section,subsection)
            #self.section_dict[url] = (self.section, self.subsection)

            #print(section, subsection, url)

        next_url=get_next_url(self.cursor, 1)
        if next_url is None:
            print("No more urls to parse, finishing")
        else:
            yield SplashRequest(next_url,
                            self.parse,
                            endpoint='render.html',
                            args={
                                'wait': 10,
                                'section': section,
                                'subsection': subsection
                            })
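
The grocery/Walmart spider examples here (#2, and later #18, #23, #24, #25 and #33) call a project-local, two-argument clean_string(value, chars_to_remove) rather than util.clean_string. A minimal sketch of such a helper, assuming it only strips the given substrings and surrounding whitespace (the real project may do more), could be:

def clean_string(value, chars_to_remove=""):
    """Remove each substring in chars_to_remove from value, then trim whitespace.

    Illustrative sketch only; chars_to_remove may be a string or a list,
    e.g. clean_string(' 4 Count ', ['Count']) -> '4'.
    """
    for token in chars_to_remove:
        value = value.replace(token, '')
    return value.strip()
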
Code example #3
        def get_name_and_electronic_votes():
            name_votes = {}
            electronic_votes = {}
            s3 = soup.find('div', {'class': 'Section3'})
            if s3:
                tags = s3.find_all(
                    text=re.compile(r'Vote\s*nominatif\s*-\s*Naamstemming:'))
                tags += s3.find_all(
                    text=re.compile(r'Naamstemming\s*-\s*Vote\s*nominatif:'))
                for i, tag in enumerate(tags):
                    vote_number = extract_vote_number_from_tag(tag, i)
                    vote_header = go_to_p(tag)
                    cancelled, current_node = is_vote_cancelled(vote_header)
                    if cancelled:
                        continue

                    yes, current_node = extract_name_list_from_under_table(
                        current_node.find_next_sibling())
                    no, current_node = extract_name_list_from_under_table(
                        current_node.find_next_sibling())

                    abstention = []

                    # The abstention block may be missing (meaning no abstentions); only parse it when present
                    if 'onthoudingen' in current_node.get_text().lower(
                    ) or 'abstentions' in current_node.get_text().lower():
                        next_vote = go_to_p(tags[
                            i + 1]).find_previous_sibling() if i + 1 < len(
                                tags) else vote_header.parent.find_all('p')[-1]
                        current_node = next_vote
                        abstention = clean_string(current_node.get_text())
                        current_node = current_node.find_previous_sibling()

                        # TODO: merge with function
                        while not (current_node.name == "table"
                                   or 'naamstemming'
                                   in current_node.get_text().lower()):
                            if current_node.get_text():
                                abstention = clean_string(
                                    current_node.get_text()) + ',' + abstention
                            current_node = current_node.find_previous_sibling()
                        abstention = clean_list(abstention.split(','))

                    name_votes[vote_number] = (yes, no, abstention)

                tags = s3.find_all(text=re.compile(
                    r'Comptage\s*électronique\s*–\s*Elektronische telling:'))
                for i, tag in enumerate(tags):
                    vote_number = extract_vote_number_from_tag(tag, i)
                    vote_header = go_to_p(tag)
                    cancelled, current_node = is_vote_cancelled(vote_header)

                    if cancelled:
                        continue

                    electronic_votes[vote_number] = current_node

            return name_votes, electronic_votes
Code example #4
        def extract_name_list_from_under_table(current_node):
            name_list = clean_string(current_node.get_text())
            while not (current_node.name == "table"
                       or 'naamstemming' in current_node.get_text().lower()):
                if current_node.get_text():
                    name_list += ',' + clean_string(current_node.get_text())
                current_node = current_node.find_next_sibling()

            name_list = clean_list(name_list.split(','))
            return name_list, current_node
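
The parliament-scraping examples (#3, #4, #5 and the vote parsers further below) rely on one-argument clean_string and clean_list helpers that are not shown in these excerpts. Plausible minimal sketches, assuming they only normalize whitespace and drop empty entries (the real helpers may do more), might be:

import re

def clean_string(text):
    """Collapse runs of whitespace (including non-breaking spaces) and trim."""
    return re.sub(r'\s+', ' ', text.replace(u'\xa0', ' ')).strip()

def clean_list(items):
    """Strip each entry and drop the empty ones."""
    return [item.strip() for item in items if item and item.strip()]
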
Code example #5
        def extract_title_by_vote(table: NavigableString, language: Language):
            class_name = Meeting.language_mapping[language][1]

            next_line = table.find_previous_sibling("p", {"class": class_name})
            while not re.match(r"([0-9]+) (.)*", clean_string(next_line.text)):
                next_line = next_line.find_previous_sibling(
                    "p", {"class": class_name})

            match = re.match(r"([0-9]+) (.*)", clean_string(next_line.text))
            return int(match.group(1))
Code example #6
def check_top_achat(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        nb_resultats = tree.xpath(
            '//*[@id="content"]/nav[1]/ul/li[4]/text()')[0]
        nb = util.make_num(nb_resultats)
        results = []

        liste_prix_ = tree.xpath(
            "//section[@class = 'produits list']//div[@itemprop= 'price']/text()"
        )
        liste_titres = tree.xpath(
            "//section[@class = 'produits list']//div[@class = 'libelle']/a/h3/text()"
        )
        liste_dispos = tree.xpath(
            "//section[@class = 'produits list']//section[last()]/@class")

        for i in range(0, int(nb)):
            prix_ = liste_prix_[i][0:-4]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue

            titre = liste_titres[i]
            geforce_ad = " + 1 an d'abonnement GeForce Now offert ! ".lower()
            call_of_ad = "+ Call of Duty: Black Ops Cold War offert ! ".lower()
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue
            elif (geforce_ad in titre.lower()):
                titre = titre[0:len(titre) - len(geforce_ad)]
            elif (call_of_ad in titre.lower()):
                titre = titre[0:len(titre) - len(call_of_ad)]

            raw_dispo = liste_dispos[i]
            dispo = ""
            if (raw_dispo == 'en-rupture'):
                dispo = 'Rupture'
            elif (raw_dispo == 'dispo-sous-7-jours'):
                dispo = 'sous 7 jours'
            elif (raw_dispo == 'dispo-entre-7-15-jours'):
                dispo = 'entre 7-15 jours'
            elif (raw_dispo == 'dispo-plus-15-jours'):
                dispo = '+ de 15 jours'
            else:
                dispo = raw_dispo

            results.append(
                ('topachat.com         ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))
        out_results += results

    return out_results
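
The stock-checker examples (#6, #7, #8, #13, #22 and #28) also depend on util.make_num, which is not included in these excerpts. A rough sketch, assuming it simply keeps the digit characters of a price or result-count string, might be:

def make_num(text):
    """Keep only digit characters, e.g. '48 articles' -> '48' (illustrative sketch)."""
    return ''.join(ch for ch in text if ch.isdigit())
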
Code example #7
def check_pc_componentes(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        titres = tree.xpath(
            f"//div[@class = 'c-product-card__content']/header/h3/a/text()")
        prixs = tree.xpath(
            f"//div[@class = 'c-product-card__content']/div[2]/div/span/text()"
        )
        dispos = tree.xpath(
            f"//div[@class = 'c-product-card__content']/div[3]/text()")

        results = []
        for titre, prix, dispo in zip(titres, prixs, dispos):
            if (',' in prix):
                prix = util.make_num(prix[0:-4])
            else:
                prix = util.make_num(prix)

            if (int(prix) >= 850):
                continue

            if 'rtx' not in titre.lower():
                continue

            avoid_bool = False
            avoid_words = [
                'reacondicionado', 'recondicionado', 'water', 'hydro', 'ekwb',
                'intel', 'ryzen', '2080', '2070', 'i7', 'i5', 'Vector'
            ]
            for a in avoid_words:
                if a in util.clean_string(titre.lower()):
                    avoid_bool = True
                    break

            if avoid_bool:
                continue

            if (util.clean_string(dispo).lower() == "sin fecha de entrada"):
                dispo = "Rupture"
            else:
                dispo = "Check dispo"

            results.append(
                ('pccomponentes.com    ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))

        out_results += results
    return out_results
Code example #8
def ldlc_targeted(url):
    tree = util.get_tree(url)

    name = tree.xpath("/html/body/div[3]/div[2]/div[1]/h1/text()")[0]
    dispo = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[4]/div[1]/div[2]/div/span/text()"
    )[0]
    prix_ = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[1]/div/text()"
    )[0][0:-1]

    prix = util.make_num(prix_)
    return (util.clean_string(name), util.clean_string(dispo),
            util.clean_string(prix))
Code example #9
def counter_processing( dataframe, is_zip_root, sanitize_dev=False ):
	word_set_diff = set()  # Used for storing words/phrases to later be similarity checked
	d = dataframe

	# Dict to aggregate counts
	nested_dict = defaultdict(Counter)

	cross_ref_fix = 0  # Option 2 related only, cross-ref counter

	# Primary loop to locate, and increment. Cleanse functions are used here.
	for row in range(len(d)):

		# Yank
		zip = d.loc[row, 'incident_zip']
		complaint = d.loc[row, 'complaint_type']
		borough = d.loc[row, 'borough']

		# Clean
		zip = verify_clean_zip(zip)
		complaint = clean_string(complaint)
		borough = clean_string(borough)

		# Dev - String Similarity sets
		word_set_diff.add(borough)
		word_set_diff.add(complaint)

		print(borough, " - ", complaint, " - ", zip)  # Raw print as rows iterate

		if (is_zip_root):  # Option 1 at menu (Zip is parent/root)
			nested_dict[zip][complaint] += 1
		elif ('unspecified' not in borough):  # Option 2 always - with unspecified check
			nested_dict[borough][complaint] += 1
		elif ("unspecified" in borough and zip is not None):  # Option 2 but bad borough string
			print(borough, zip)
			if attempt_borough_from_zip(zip):
				borough = attempt_borough_from_zip(zip)  # Attempting cross reference to find borough
				if borough:
					cross_ref_fix += 1
					nested_dict[clean_string(borough)][complaint] += 1  # Success on cross reference!
			else:
				print("No Cross Reference Found")

	print("\n" * 5, " -------- \n")
	pprint(dict(nested_dict))  # Print out final structure
	if (not is_zip_root):
		print("FIXED CROSS REFERENCED:", cross_ref_fix)
	# Kicks off fuzzy-wuzzy checking (Option 5)
	if sanitize_dev:
		print("\n\n-- FUZZY CHECKING --")
		handle_similarity_debug(word_set_diff)
Code example #10
File: test_util.py, Project: Stanford-PERTS/yosemite
    def test_clean_string(self):
        """Test that clean_string() returns only lowercase a-z of type str."""
        strings_to_clean = [
            u'Nicholas',
            u'Nicolás',
            u'N1colas',
            u'N#$colas',
            u'Nichol"a"s',
            u'Nich\olas',
            '12345',  # Some schools want to use ids rather than last names
            'Nicholas',
            'N1colas',
            'N#$colas',
            'Nichol"a"s',
            "Nich\olas",
            # This guy *shouldn't* fail, but it won't return what we want it to
            # (This isn't a problem right now, because the front end is serving
            # us unicode objects, not strs.):
            'Nicolás',
        ]

        for index, test_string in enumerate(strings_to_clean):
            # Nothing but lowercase alphabetic characters and digits,
            # beginning to end.
            pattern = r'^[a-z0-9]+$'
            cleaned_string = util.clean_string(test_string)
            # re.match() will return None if the pattern doesn't match.
            self.assertIsNotNone(re.match(pattern, cleaned_string),
                                 'string index: {}'.format(index))

            # output must always be a string (not unicode)
            self.assertIsInstance(cleaned_string, str)
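
The test above effectively documents the contract of util.clean_string in this project: the output contains only lowercase a-z and 0-9 and is a plain (byte) str. A minimal Python 2 sketch that would satisfy these assertions (the real Yosemite implementation may differ) could be:

import re
import unicodedata

def clean_string(s):
    """Reduce a name to lowercase ascii letters and digits (illustrative sketch)."""
    if isinstance(s, unicode):
        # Transliterate accented characters to plain ascii first.
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    return re.sub(r'[^a-z0-9]', '', s.lower())
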
Code example #11
    def _identify_partial_matches(self, classroom_id, last_name):
        """Search for partial matches to identify students.

        "Partial" means we don't use first name.
        """
        stripped_last_name = util.clean_string(last_name)

        # <issue #208>
        #   <remove later>
        # For the time being, it also means either regular or stripped versions
        # of names. In the future, we will only process stripped names.
        normal_q = self._base_identify_query(classroom_id)
        normal_q.filter('last_name =', last_name)
        #   </remove later>
        # </issue #208>

        # Query based on stripped names because we expect students to type
        # their name differently from session to session. Stripping attempts
        # to make their name uniform and still unique. See util.clean_string().
        stripped_q = self._base_identify_query(classroom_id)
        stripped_q.filter('stripped_last_name =', stripped_last_name)

        # <issue #208>
        #   <remove later>
        combined_results = normal_q.fetch(5) + stripped_q.fetch(5)
        unique_results = list(set(combined_results))
        return unique_results
Code example #12
    def _identify_partial_matches(self, cohort_id, last_name):
        """Search for partial matches to identify students.

        Pulls data from a special set of memcache keys, which are updated by
        cron, and provide the names of all students in the school. All the
        names are examined to see if the typed name is contained in or
        contained by the existing name ("containment matching"), which are
        considered partial matches. Then the matches are ordered by their
        similarity (Levenshtein distance) to the typed name.
        """
        stripped_last_name = util.clean_string(last_name)

        match_data, from_memcache = self.internal_api.get_roster(cohort_id)

        # White list necessary properties (no sense in releasing status codes
        # like 'Parent Refusal' to the public).
        def clean_properties(d):
            white_list = [
                'first_name', 'last_name', 'classroom_name', 'id',
                'stripped_last_name'
            ]
            return {k: v for k, v in d.items() if k in white_list}

        # Containment matching.
        matches = [
            clean_properties(u) for u in match_data
            if u['stripped_last_name'] in stripped_last_name
            or stripped_last_name in u['stripped_last_name']
        ]

        # Order by edit (Levenshtein) distance from the submitted name.
        sort_func = lambda n: util.levenshtein_distance(
            n['stripped_last_name'], stripped_last_name)
        return sorted(matches, key=sort_func)
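
To make the containment matching and ordering described in the docstring concrete, here is a small self-contained illustration with toy roster data and a textbook edit-distance function (util.levenshtein_distance itself is not shown in these excerpts):

def levenshtein_distance(a, b):
    """Classic dynamic-programming edit distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

roster = [{'stripped_last_name': 'smith'},
          {'stripped_last_name': 'smithson'},
          {'stripped_last_name': 'smit'}]
typed = 'smith'

# Containment matching: keep entries where either name contains the other.
matches = [u for u in roster
           if u['stripped_last_name'] in typed or typed in u['stripped_last_name']]

# Order by edit distance from the typed name: smith (0), smit (1), smithson (3).
matches.sort(key=lambda u: levenshtein_distance(u['stripped_last_name'], typed))
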
Code example #13
def check_ldlc(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        nb_resultats = tree.xpath(
            '/html/body/div[3]/div/div[3]/div[1]/div/div[2]/div[1]/div[1]/text()'
        )[0]
        nb = util.make_num(nb_resultats)

        # 48 is the maximum number of items on a page
        if int(nb) > 48:
            nb = 48

        results = []

        for i in range(1, int(nb) + 1):
            prix_ = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[4]/div[1]/div/text()"
            )[0]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue

            titre = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[1]/div[1]/h3/a/text()"
            )[0]
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue

            dispo = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/text()"
            )[0]

            dispo_p2 = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/em/text()"
            )
            if len(dispo_p2) >= 1:
                dispo = dispo + ' ' + dispo_p2[0]

            results.append(('LDLC.com             ' + util.clean_string(titre),
                            util.clean_string(dispo), util.clean_string(prix)))

        out_results += results

    return out_results
Code example #14
    def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
        """Generate a new Vote from a parsed table.

        Args:
            meeting_topic (MeetingTopic): The meeting topic
            vote_number (int): Number of the vote in this meeting (e.g. 1)
            vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

        Returns:
            Vote: 
        """
        yes = int(clean_string(vote_rows[1].find_all(
            'td')[1].find('p').get_text()))
        no = int(clean_string(vote_rows[2].find_all(
            'td')[1].find('p').get_text()))
        abstention = int(clean_string(
            vote_rows[3].find_all('td')[1].find('p').get_text()))

        return GenericVote(meeting_topic, vote_number, yes, no, abstention)
Code example #15
            def parse_topics(language):
                classes = Meeting.language_mapping[language]
                titles = soup.find_all('p', {'class': classes[1]})
                current_title = ""

                while titles:
                    item = titles.pop()
                    if not clean_string(item.text):
                        continue
                    while not re.match("([0-9]+) (.*)", clean_string(
                            item.text)):
                        current_title = clean_string(
                            item.text) + '\n' + current_title
                        item = titles.pop()
                    m = re.match("([0-9]+) (.*)", clean_string(item.text))

                    current_title = m.group(2) + '\n' + current_title
                    section = item.find_previous_sibling(
                        "p", {"class": classes[0]})

                    item = int(m.group(1))
                    if item not in self.topics:
                        self.topics[item] = MeetingTopic(
                            self.parliamentary_session, self, item)
                    self.topics[item].set_title(language,
                                                current_title.rstrip())
                    self.topics[item].set_section(
                        language,
                        clean_string(section.text) if section else
                        ("Algemeen"
                         if language == Language.NL else "Generale"))
                    self.topics[item].complete_type()
                    if language == Language.NL:
                        title = normalize_str(
                            current_title.rstrip().lower()).decode()
                        for member in self.parliamentary_session.get_members():
                            if member.normalized_name() in title:
                                member.post_activity(
                                    TopicActivity(member, self,
                                                  self.topics[item]))
                    current_title = ""
Code example #16
    def _identify_exact_matches(self, classroom_id, first_name, last_name):
        """Search for exact matches to identify students.

        "Exact" means match on first name, last name, and classroom.
        """
        stripped_first_name = util.clean_string(first_name)
        stripped_last_name = util.clean_string(last_name)

        logging.info(
            "Querying for exact match on is_test: False, user_type: student, "
            "classroom: {}, stripped_first_name: {}, stripped_last_name: {}".
            format(classroom_id, stripped_first_name, stripped_last_name))

        # Query based on stripped names because we expect students to type
        # their name differently from session to session. Stripping attempts
        # to make their name uniform and still unique. See util.clean_string().
        stripped_q = self._base_identify_query(classroom_id)
        stripped_q.filter('stripped_first_name =', stripped_first_name)
        stripped_q.filter('stripped_last_name =', stripped_last_name)

        return stripped_q.fetch(5)
Code example #17
    def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
        """Generate a new Vote from a parsed table.

        Args:
            meeting_topic (MeetingTopic): The meeting topic
            vote_number (int): Number of the vote in this meeting (e.g. 1)
            vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

        Returns:
            Vote: 
        """
        yes_fr = int(clean_string(
            vote_rows[2].find_all('td')[1].find('p').get_text()))
        no_fr = int(clean_string(
            vote_rows[3].find_all('td')[1].find('p').get_text()))
        abstention_fr = int(clean_string(
            vote_rows[4].find_all('td')[1].find('p').get_text()))

        yes_nl = int(clean_string(
            vote_rows[2].find_all('td')[3].find('p').get_text()))
        no_nl = int(clean_string(
            vote_rows[3].find_all('td')[3].find('p').get_text()))
        abstention_nl = int(clean_string(
            vote_rows[4].find_all('td')[3].find('p').get_text()))

        return LanguageGroupVote(meeting_topic, vote_number, GenericVote(meeting_topic, vote_number, yes_nl, no_nl, abstention_nl), GenericVote(meeting_topic, vote_number, yes_fr, no_fr, abstention_fr))
Code example #18
    def parse(self, response):
        page_1_str=self.page_str+"1"
        this_url = trim_url(response.url,page_1_str)
        print (f"inside parse for {this_url}")
        self.scrape_urls(response)

        # Only scrape pages that have the page_str in the url.
        if this_url.find(self.page_str) != -1:
            print (f"scraping for {this_url}")
            items = response.css('product-item-v2')
            print(f"length of items - {len(items)}")
            metadata=get_url_metadata(self.cursor,this_url)
            section=metadata[1]
            subsection=metadata[2]
            for item in items:
                name = item.css('.product-title ::text').get()
                price_strings = item.css('.product-price ::text').getall()
                price = clean_string(price_strings[-1],['$'])
                ppu = item.css('.product-price-qty ::text').get()
                unit = self.collect_units(name)
                #inspect_response(response,self)

                if unit == "OZ" or unit == "LB":
                    ounces = self.collect_ounces(name)
                else:
                    ounces = 0
                print (f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
                yield{
                  "name": name,
                  "price": price,
                  "ounces": ounces,
                  "unit": unit,
                  "price-per-unit": ppu,
                  "url": this_url,
                  "section": section,
                  "subsection": subsection
                }

        # The website redirects us to the url plus page_1_str, which isn't stored in our database,
        # so we trim that off to recover the url as it appears in our database.
        finish_url(self.conn,self.store_id,this_url)
        print("finishing url - " + this_url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print ("Next url is none therefore we must be finished ! ")
            return
        else:
            next_request = create_parse_request(next_url,
                                                self.check_location,
                                                EC.element_to_be_clickable((By.CSS_SELECTOR,'#openFulfillmentModalButton')))
        print(f"got next_url - {next_url}")
        yield next_request
Code example #19
def electronic_vote_from_table(meeting_topic, vote_number: int, vote_start_node: NavigableString):
    """Generate a new electronic (advisory or generic) vote from a parsed table.

    Args:
        meeting_topic (MeetingTopic): The meeting topic
        vote_number (int): Number of the vote in this meeting (e.g. 1)
        vote_start_node (NavigableString): Vote start node as obtained by BeautifulSoup

    Returns:
        Vote: 
    """

    yes = int(clean_string(vote_start_node.find_all(
        'td')[1].find('p').get_text()))
    vote_end_node = vote_start_node.find_next_sibling().find_next_sibling()
    if not vote_end_node or vote_end_node.name != 'table':
        return ElectronicAdvisoryVote(meeting_topic, vote_number, yes)

    no = int(clean_string(vote_end_node.find_all(
        'td')[1].find('p').get_text()))

    return ElectronicGenericVote(meeting_topic, vote_number, yes, no)
Code example #20
File: Archiver.py, Project: jdigilio/epicsarchiver
    def update_value(self,name,ts,val):
        "insert value into appropriate table " 
        if val is None: return
        if ts is None or ts < self.MIN_TIME: ts = time.time()
       
        self.pvinfo[name]['last_ts'] =  ts
        self.pvinfo[name]['last_value'] =  val

        info = self.pvinfo[name]
        try:
            self.db.execute(self.sql_insert % (info['data_table'],info['id'], ts,clean_string(val)))
        except TypeError:
            self.write("cannot update %s\n" % name)
Code example #21
    def from_table(meeting_topic, vote_number: int, vote_rows: NavigableString):
        """Generate a new Vote from a parsed table.

        Args:
            meeting_topic (MeetingTopic): The meeting topic
            vote_number (int): Number of the vote in this meeting (e.g. 1)
            vote_rows (NavigableString): Vote rows as obtained by BeautifulSoup

        Returns:
            Vote: 
        """
        yes_str = clean_string(vote_rows[1].find_all(
            'td')[1].find('p').get_text())
        if not yes_str:
            # Sometimes, tables are empty... example: https://www.dekamer.be/doc/PCRI/html/55/ip100x.html
            return None
        yes = int(yes_str)
        no = int(clean_string(vote_rows[2].find_all(
            'td')[1].find('p').get_text()))
        abstention = int(clean_string(
            vote_rows[3].find_all('td')[1].find('p').get_text()))

        return GenericVote(meeting_topic, vote_number, yes, no, abstention)
Code example #22
def check_materiel(url_list, web_driver):
    output_results = []
    for url in url_list:
        web_driver.get(url)

        nb_resultats = web_driver.find_element_by_xpath(
            '//*[@id="tabProducts"]').text
        nb = util.make_num(nb_resultats)

        if int(nb) > 48:
            nb = 48

        results = []
        for i in range(1, int(nb) + 1):
            prix_ = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[4]/div[1]/span").text[0:-2]
            prix = util.make_num(prix_)
            if (int(prix) >= 850):
                continue

            titre = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[2]/a/h2").text
            if ('water' in titre.lower() or 'hydro' in titre.lower()):
                continue

            dispo = web_driver.find_element_by_xpath(
                f"//*[@data-position = '{i}']/div[3]/div/span[2]").text

            if dispo == 'RUPTURE':
                dispo = "Rupture"

            results.append(('Materiel.net         ' + util.clean_string(titre),
                            util.clean_string(dispo), util.clean_string(prix)))

        output_results += results

    return output_results
Code example #23
File: urlScraper.py, Project: gobfink/Groceries
 def get_quantity(self):
     quantity_selector = (
         "body > app-root > div > hts-layout > span > hts-shop-by-category > div > "
         "section > div > div.product-category-list.col-lg-7.col-md-9.column7 >  "
         "div.smart-filter.clearfix > h2 > span")
     ret = 0
     try:
         quantity = self.driver.find_element_by_css_selector(
             quantity_selector).text
         quantity = clean_string(quantity, ['(', ')'])
         # Guard against an empty quantity string before converting.
         if not quantity:
             quantity = '0'
         ret = int(quantity)
     except NoSuchElementException:
         ret = 0
     print(f"in get_quantity - found quantity of {ret}")
     return ret
Code example #24
File: groceryScraper.py, Project: gobfink/Groceries
    def parse(self, response):
        self.driver = response.request.meta['driver']
        close_modal(self)
        change_store_location(self)

        url = response.url
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]
        #check if it has a next button,
        items = response.css('.cell-content-wrapper')
        for item in items:
            name = item.css('.cell-title-text ::text').get()
            name = clean_string(name, ['\"'])
            price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
            price = convert_dollars(price)

            quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

            unit = item.css('.cell-product-size ::text').get()
            ounces = convert_to_ounces(unit)

            ppu = item.css('[data-test="per-unit-price"] ::text').get()
            ppu = convert_ppu(ppu)

            self.logger.info(
                f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
            )
            #inspect_response(response,self)
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        finish_url(self.conn, self.store_id, url)

        request = self.get_next_request()
        yield request
Code example #25
 def collect_ounces(self,string):
     split = string.split(' - ')
     ounces = 0
     
     if len(split) == 1:
         print (f"No -'s found in {string} - not updating ounces")
     elif len(split) == 2:
         weight = split[1]
         ounces = convert_to_ounces(weight)
     elif len(split) == 3:
         quantity = split[1]
         weight = convert_to_ounces(split[2])
         quantity = clean_string(quantity,["Count"])
         if quantity.isdigit():
             quantity=int(quantity)
         else:
             quantity=1
         ounces = weight * quantity
     else:
         print(f"Collect_ounces too many '-'s in string {string}")
     return ounces
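
collect_ounces, like the other grocery-spider excerpts here, leans on a convert_to_ounces helper that is not shown. A rough sketch, assuming size strings such as '32 OZ' or '2 LB' (the real helper may cover more units), might be:

def convert_to_ounces(size_str):
    """Parse a size string like '32 OZ' or '2 LB' into ounces (illustrative sketch)."""
    if not size_str:
        return 0
    parts = size_str.strip().upper().split()
    try:
        value = float(parts[0])
    except (ValueError, IndexError):
        return 0
    unit = parts[1] if len(parts) > 1 else 'OZ'
    return value * 16 if unit.startswith('LB') else value
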
Code example #26
File: brazil.py, Project: fabiomdiniz/verdandi
def get_market():
    #util.clean_market(0)
    url = 'http://pregao-online.bmfbovespa.com.br/Cotacoes.aspx'
    soup = BeautifulSoup(urlfetch.fetch(url, deadline=50).content, 'lxml')
    rate = util.get_exchange()
    #dt = get_datetime()
    dt = datetime.datetime.now(tz.tzstr('EBST3EBDT'))

    market = Market(ref=0, date=dt.date(), time=dt.time(), exchange_rate=rate)
    market.put()

    table = soup('table', attrs={'id': 'ctl00_DefaultContent_GrdCarteiraIndice'})[0]
    for tr in table('tr')[1:]:
        tds = tr('td')
        code = str(tds[0].string)
        name = util.clean_string(tds[1].string)
        value = util.get_float(tds[2].string)
        diff = util.get_float(tds[3].text.strip())
        stock = Stock(name=util.get_or_create_name(0, code, name),
                      value=value, diff=diff, market=market.key())
        stock.put()
Code example #27
	def get_anken(self):
		fp = urllib2.urlopen(self.url)
		html = fp.read()
		fp.close()

		html = unicode(html, 'euc_jp', 'ignore')
		self.html = util.clean_string(html)

		# print(self.html)

		self.anken = dao_anken.ClassAnken()

		self.anken.nyusatsu_system = 1
		self.anken.nyusatsu_type = 1
		self.anken.anken_url = self.url
		self.anken.keishu_cd = self.keishu_cd
		self.anken.public_flag = self.public_flag

		self.anken.anken_no = self.get_anken_no()
		self.anken.anken_name = self.get_anken_name()
		self.anken.keishu_name = self.get_keishu_name()
		self.anken.company_area = self.get_company_area()
		self.anken.anken_open_date = self.get_anken_open_date()
		self.anken.anken_close_date = self.get_anken_close_date()
		self.anken.tender_date = self.get_tender_date()
		self.anken.tender_place = self.get_tender_place()
		self.anken.limit_date = self.get_limit_date()
		self.anken.gyoumu_kbn_1 = self.get_gyoumu_kbn_1()
		self.anken.gyoumu_kbn_2 = self.get_gyoumu_kbn_2()
		self.anken.kasitu_name = self.get_kasitu_name()
		self.anken.tanto_name = self.get_tanto_name()
		self.anken.notes = self.get_notes()
		self.anken.result_open_date = self.get_result_open_date()
		self.anken.result_close_date = self.get_result_close_date()
		self.anken.raku_name = self.get_raku_name()
		self.anken.price = self.get_price()
		self.anken.attached_file_1 = self.get_attached_file_1()
		self.anken.attached_file_2 = self.get_attached_file_2()
		self.anken.attached_file_3 = self.get_attached_file_3()
Code example #28
def check_nvidia(url, web_driver):
    web_driver.get(url)
    num = int(
        util.make_num(
            web_driver.find_element_by_xpath(
                '/html/body/app-root/product/div[1]/div[1]/div[2]/div/suggested-product/div/div'
            ).text))
    results = []
    name = web_driver.find_element_by_xpath(
        '//featured-product/div/div/div[2]/div[2]/h2').text
    dispo = web_driver.find_element_by_xpath(
        '//featured-product/div/div/div[2]/div[3]/div[1]/div[2]/a').text
    prix = util.make_num(
        web_driver.find_element_by_xpath(
            '//featured-product/div/div/div[2]/div[3]/div[1]/div[1]/div/span[1]'
        ).text)

    if dispo == "RUPTURE DE STOCK":
        dispo = "Rupture"

    results.append(
        ("FE    " + util.clean_string(name), util.clean_string(dispo),
         util.clean_string(prix)))

    if num is None:
        num = 2

    for i in range(1, num):
        name = web_driver.find_element_by_xpath(
            f'//*[@id="resultsDiv"]/div/div[{i}]/div[2]/h2').text
        dispo = web_driver.find_element_by_xpath(
            f'//*[@id="resultsDiv"]/div/div[{i}]/div[3]/div[2]/div[2]/a').text
        prix = util.make_num(
            web_driver.find_element_by_xpath(
                f'//*[@id="resultsDiv"]/div/div[{i}]/div[3]/div[2]/div[1]/div/span[1]'
            ).text)

        if dispo == "RUPTURE DE STOCK":
            dispo = "Rupture"

        results.append(("FE                   " + util.clean_string(name),
                        util.clean_string(dispo), util.clean_string(prix)))

    return results
Code example #29
	def get_sql(self):
		return  util.clean_string(self.sql)
Code example #30
File: SimpleDB.py, Project: jdigilio/epicsarchiver
 def clean_string(self, s):    return clean_string(s)
 def string_literal(self, s):  return string_literal(s)
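
In the epicsarchiver examples (#20, #30 and #34), clean_string is interpolated directly into SQL with the % operator, so it presumably returns a safely quoted SQL string literal. A heavily simplified sketch of that idea (the real module most likely delegates to the database driver's own escaping, as string_literal above suggests) could be:

def clean_string(value):
    """Return value as a quoted SQL string literal.

    Illustrative sketch only; real code should prefer parameterized queries
    or the driver's escaping functions.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return "'%s'" % escaped
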
Code example #31
File: data_loader.py, Project: leo-lp/neon-1
def load_data(path,
              file_ext=['txt'],
              valid_split=None,
              vocab_file_name=None,
              max_vocab_size=None,
              max_len_w=None,
              output_path=None,
              subset_pct=100):
    """
    Given a path where data are saved, look for the ones with the right extensions
    If a split factor is given, it will split all the files into training and valid
    set. Then build vocabulary from the training and validation sets.

    Arguments:
        path: which directory to look for all the documents
        file_ext: what extension of the files to look for
        valid_split: to split the data into train/valid set. If None, no split
        vocab_file_name: optional file name. If None, the script will decide a name
                         given path and split
        max_vocab_size: maximum number of words to use in vocabulary (by most frequent)
        max_len_w: maximum length of sentences in words
        output_path: path used to save preprocessed data and results
        subset_pct: subset of dataset to load into H5 file (percentage)

    Returns:
        The function saves 2 files:
        h5 file with preprocessed data
        vocabulary file with: vocab, reverse_vocab, word_count
    """
    file_names = get_file_list(path, file_ext)

    file_str = get_file_str(path,
                            len(file_names),
                            labelled=False,
                            valid_split=valid_split,
                            subset_pct=subset_pct)

    # create output dir if needed
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # file name to store the vocabulary
    if vocab_file_name is None:
        vocab_file_name = file_str + '.vocab'
        vocab_file_name = os.path.join(output_path, vocab_file_name)

    # If max sizes aren't set, assume no limit
    if not max_len_w:
        max_len_w = sys.maxsize
    if not max_vocab_size:
        max_vocab_size = sys.maxsize

    # file name to store the pre-processed train/valid dataset
    h5_file_name = os.path.join(output_path, file_str + '.h5')

    if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name):
        neon_logger.display(
            "dataset files {} and vocabulary file {} already exist. "
            "will use cached data. ".format(h5_file_name, vocab_file_name))
        return h5_file_name, vocab_file_name

    # split into training/valid set
    if valid_split is not None:
        if 'json' in file_ext:
            # Split based on number of files
            train_split = int(np.ceil(len(file_names) * (1 - valid_split)))
            train_files = file_names[:train_split]
            valid_files = file_names[train_split:]

            train_sent = load_json_sent(train_files, subset_pct)
            valid_sent = load_json_sent(valid_files, subset_pct)
            all_sent = train_sent + valid_sent
        elif 'txt' in file_ext:
            # Split based on number of lines (since only 2 files)
            all_sent = load_txt_sent(file_names, subset_pct)
            train_split = int(np.ceil(len(all_sent) * (1 - valid_split)))

            train_sent = all_sent[:train_split]
            valid_sent = all_sent[train_split:]
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
    else:
        train_files = file_names
        if 'json' in file_ext:
            train_sent = load_json_sent(train_files, subset_pct)
        elif 'txt' in file_ext:
            train_sent = load_txt_sent(train_files, subset_pct)
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
        all_sent = train_sent

    if os.path.exists(vocab_file_name):
        neon_logger.display(
            "open existing vocab file: {}".format(vocab_file_name))
        vocab, rev_vocab, word_count = load_obj(vocab_file_name)
    else:
        neon_logger.display("Building  vocab file")

        # build vocab
        word_count = defaultdict(int)
        for sent in all_sent:
            sent_words = tokenize(sent)

            if len(sent_words) > max_len_w or len(sent_words) == 0:
                continue

            for word in sent_words:
                word_count[word] += 1

        # sort the word_count , re-assign ids by its frequency. Useful for downstream tasks
        # only done for train vocab
        vocab_sorted = sorted(word_count.items(),
                              key=lambda kv: kv[1],
                              reverse=True)

        vocab = OrderedDict()

        # get word count as array in same ordering as vocab (but with maximum length)
        word_count_ = np.zeros((len(word_count), ), dtype=np.int64)
        for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]):
            word_count_[i] = word_count[t]
            vocab[t] = i
        word_count = word_count_

        # generate the reverse vocab
        rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items())

        neon_logger.display("vocabulary from {} is saved into {}".format(
            path, vocab_file_name))
        save_obj((vocab, rev_vocab, word_count), vocab_file_name)

    vocab_size = len(vocab)
    neon_logger.display(
        "\nVocab size from the dataset is: {}".format(vocab_size))

    neon_logger.display(
        "\nProcessing and saving training data into {}".format(h5_file_name))

    # now process and save the train/valid data
    h5f = h5py.File(h5_file_name, 'w', libver='latest')
    shape, maxshape = (len(train_sent), ), (None)
    dt = np.dtype([('text', h5py.special_dtype(vlen=str)),
                   ('num_words', np.uint16)])
    report_text_train = h5f.create_dataset('report_train',
                                           shape=shape,
                                           maxshape=maxshape,
                                           dtype=dt,
                                           compression='gzip')
    report_train = h5f.create_dataset('train',
                                      shape=shape,
                                      maxshape=maxshape,
                                      dtype=h5py.special_dtype(vlen=np.int32),
                                      compression='gzip')

    # map text to integers
    wdata = np.zeros((1, ), dtype=dt)
    ntrain = 0
    for sent in train_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        # enforce maximum sentence length
        if len(text_int) > max_len_w or len(text_int) == 0:
            continue

        report_train[ntrain] = text_int

        wdata['text'] = clean_string(sent)
        wdata['num_words'] = len(text_int)
        report_text_train[ntrain] = wdata
        ntrain += 1

    report_train.attrs['nsample'] = ntrain
    report_train.attrs['vocab_size'] = vocab_size
    report_text_train.attrs['nsample'] = ntrain
    report_text_train.attrs['vocab_size'] = vocab_size

    if valid_split:
        neon_logger.display(
            "\nProcessing and saving validation data into {}".format(
                h5_file_name))
        shape = (len(valid_sent), )
        report_text_valid = h5f.create_dataset('report_valid',
                                               shape=shape,
                                               maxshape=maxshape,
                                               dtype=dt,
                                               compression='gzip')
        report_valid = h5f.create_dataset(
            'valid',
            shape=shape,
            maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32),
            compression='gzip')
        nvalid = 0
        for sent in valid_sent:
            text_int = [
                -1 if t not in vocab else vocab[t] for t in tokenize(sent)
            ]

            # enforce maximum sentence length
            if len(text_int) > max_len_w or len(text_int) == 0:
                continue

            report_valid[nvalid] = text_int
            wdata['text'] = clean_string(sent)
            wdata['num_words'] = len(text_int)
            report_text_valid[nvalid] = wdata
            nvalid += 1

        report_valid.attrs['nsample'] = nvalid
        report_valid.attrs['vocab_size'] = vocab_size
        report_text_valid.attrs['nsample'] = nvalid
        report_text_valid.attrs['vocab_size'] = vocab_size

    h5f.close()

    return h5_file_name, vocab_file_name
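
A typical call to load_data above, assuming a directory of plain-text files and an 80/20 train/validation split (all paths here are illustrative), might look like:

h5_file, vocab_file = load_data('/data/corpus',
                                file_ext=['txt'],
                                valid_split=0.2,
                                max_vocab_size=20000,
                                max_len_w=50,
                                output_path='/data/preprocessed')
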
Code example #32
words = 50
# for choosing out of the n best words, specify n
n_bestwords = 100
#number of original (!) ingredient samples for random test generation
m = 50

t = pickle.load(open('tokenizer.pickle', 'rb'))
reverse_word_map = dict(map(reversed, t.word_index.items()))
print(t.word_index)
model = load_model('model.hdf5')
max_list_predictors = model._layers[0].batch_input_shape[1]

string = 'Mehl, Eier, Milch Die Eier mit der Milch verrühren und'
print(
    clean_string(
        generate_equal(string, model, t, reverse_word_map, words,
                       max_list_predictors)))

for i in range(2, 15):
    string = generate_testcases('recipes.json', m, i)
    print('Test ingredients:', string)
    print('choose equal plus threshold:')
    print(
        clean_string(
            generate_equal(string, model, t, reverse_word_map, words,
                           max_list_predictors)))
    print('choose from ' + str(n_bestwords) + ' best:')
    print(
        clean_string(
            generate_choose_from_n_best(n_bestwords, string, model, t,
                                        reverse_word_map, words,
                                        max_list_predictors)))
Code example #33
    def parse(self, response):

        url = response.url
        finish_url(self.conn, self.store_id, url)
        items = response.css('.cell-content-wrapper')
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]
        #check if it has a next button,
        next_page = response.css('.pagination-next:not(.disabled)').get()
        if next_page is not None:
            #inspect_response(response,self)
            page_string = "?page="
            page_str_len = len(page_string)
            i = url.find(page_string)
            # if yes, check whether the url already has a page part in it
            if i == -1:
                #if no, add ?page=2 to it
                next_url = url + page_string + "2"
            else:
                #if yes, extract page and add 1
                page_number = i + page_str_len
                current_page = int(url[page_number:])
                next_page = current_page + 1
                next_url = url[:page_number] + str(next_page)
            #then add to self.urls
            store_url(self.conn, next_url, self.store_id,
                      lookup_category("", section, subsection), section,
                      subsection)

        for item in items:
            name = item.css('.cell-title-text ::text').get()
            name = clean_string(name, ['\"'])
            price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
            price = convert_dollars(price)

            quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()

            unit = item.css('.cell-product-size ::text').get()
            ounces = convert_to_ounces(unit)

            ppu = item.css('[data-test="per-unit-price"] ::text').get()
            ppu = convert_ppu(ppu)

            print(
                f"name - {name}, price - {price}, quantity - {quantity}, ounces - {ounces}, ppu - {ppu}, url - {url}, section - {section}, subsection - {subsection} "
            )
            #inspect_response(response,self)
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": url,
                "section": section,
                "subsection": subsection
            }

        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print("No more URLs to parse. Finishing")
            return
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))

        if next_url is not None:
            try:
                yield request
            except:
                print(
                    f"Parse -  Errored out processing request for - {next_url} "
                )
                next_url = get_next_url(self.cursor, 2)
                print(f"Parse - Now handling {next_url}")
                request = self.create_parse_request(
                    next_url, self.parse,
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '[add-to-cart]')))

            yield SeleniumRequest(url=next_url,
                                  callback=self.parse,
                                  wait_time=50,
                                  wait_until=EC.element_to_be_clickable(
                                      (By.CSS_SELECTOR,
                                       '.button.full.cart.add')))
Code example #34
File: Archiver.py, Project: jdigilio/epicsarchiver
 def drop_pv(self,name):
     self.db.execute("update pv set active='no' where name=%s" % clean_string(name))
Code example #35
File: data_loader.py, Project: NervanaSystems/neon
def load_data(path, file_ext=['txt'], valid_split=None, vocab_file_name=None,
              max_vocab_size=None, max_len_w=None, output_path=None, subset_pct=100):
    """
    Given a path where data are saved, look for the ones with the right extensions
    If a split factor is given, it will split all the files into training and valid
    set. Then build vocabulary from the training and validation sets.

    Arguments:
        path: which directory to look for all the documents
        file_ext: what extension of the files to look for
        valid_split: to split the data into train/valid set. If None, no split
        vocab_file_name: optional file name. If None, the script will decide a name
                         given path and split
        max_vocab_size: maximum number of words to use in vocabulary (by most frequent)
        max_len_w: maximum length of sentences in words
        output_path: path used to save preprocessed data and results
        subset_pct: subset of dataset to load into H5 file (percentage)

    Returns:
        The function saves 2 files:
        h5 file with preprocessed data
        vocabulary file with: vocab, reverse_vocab, word_count
    """
    file_names = get_file_list(path, file_ext)

    file_str = get_file_str(path, len(file_names), labelled=False,
                            valid_split=valid_split, subset_pct=subset_pct)

    # create output dir if needed
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # file name to store the vocabulary
    if vocab_file_name is None:
        vocab_file_name = file_str + '.vocab'
        vocab_file_name = os.path.join(output_path, vocab_file_name)

    # If max sizes aren't set, assume no limit
    if not max_len_w:
        max_len_w = sys.maxsize
    if not max_vocab_size:
        max_vocab_size = sys.maxsize

    # file name to store the pre-processed train/valid dataset
    h5_file_name = os.path.join(output_path, file_str + '.h5')

    if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name):
        neon_logger.display("dataset files {} and vocabulary file {} already exist. "
                            "will use cached data. ".format(h5_file_name, vocab_file_name))
        return h5_file_name, vocab_file_name

    # split into training/valid set
    if valid_split is not None:
        if 'json' in file_ext:
            # Split based on number of files
            train_split = int(np.ceil(len(file_names) * (1 - valid_split)))
            train_files = file_names[:train_split]
            valid_files = file_names[train_split:]

            train_sent = load_json_sent(train_files, subset_pct)
            valid_sent = load_json_sent(valid_files, subset_pct)
            all_sent = train_sent + valid_sent
        elif 'txt' in file_ext:
            # Split based on number of lines (since only 2 files)
            all_sent = load_txt_sent(file_names, subset_pct)
            train_split = int(np.ceil(len(all_sent) * (1 - valid_split)))

            train_sent = all_sent[:train_split]
            valid_sent = all_sent[train_split:]
        else:
            neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'."
                                .format(file_ext))
    else:
        train_files = file_names
        if 'json' in file_ext:
            train_sent = load_json_sent(train_files, subset_pct)
        elif 'txt' in file_ext:
            train_sent = load_txt_sent(train_files, subset_pct)
        else:
            neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'."
                                .format(file_ext))
        all_sent = train_sent

    if os.path.exists(vocab_file_name):
        neon_logger.display("open existing vocab file: {}".format(vocab_file_name))
        vocab, rev_vocab, word_count = load_obj(vocab_file_name)
    else:
        neon_logger.display("Building  vocab file")

        # build vocab
        word_count = defaultdict(int)
        for sent in all_sent:
            sent_words = tokenize(sent)

            if len(sent_words) > max_len_w or len(sent_words) == 0:
                continue

            for word in sent_words:
                word_count[word] += 1

        # sort the word_count , re-assign ids by its frequency. Useful for downstream tasks
        # only done for train vocab
        vocab_sorted = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True)

        vocab = OrderedDict()

        # get word count as array in same ordering as vocab (but with maximum length)
        word_count_ = np.zeros((len(word_count), ), dtype=np.int64)
        for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]):
            word_count_[i] = word_count[t]
            vocab[t] = i
        word_count = word_count_

        # generate the reverse vocab
        rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items())

        neon_logger.display("vocabulary from {} is saved into {}".format(path, vocab_file_name))
        save_obj((vocab, rev_vocab, word_count), vocab_file_name)

    vocab_size = len(vocab)
    neon_logger.display("\nVocab size from the dataset is: {}".format(vocab_size))

    neon_logger.display("\nProcessing and saving training data into {}".format(h5_file_name))

    # now process and save the train/valid data
    h5f = h5py.File(h5_file_name, 'w', libver='latest')
    shape, maxshape = (len(train_sent),), (None,)
    dt = np.dtype([('text', h5py.special_dtype(vlen=str)),
                   ('num_words', np.uint16)])
    report_text_train = h5f.create_dataset('report_train', shape=shape,
                                           maxshape=maxshape, dtype=dt,
                                           compression='gzip')
    report_train = h5f.create_dataset('train', shape=shape, maxshape=maxshape,
                                      dtype=h5py.special_dtype(vlen=np.int32),
                                      compression='gzip')

    # map text to integers
    wdata = np.zeros((1, ), dtype=dt)
    ntrain = 0
    for sent in train_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        # enforce maximum sentence length
        if len(text_int) > max_len_w or len(text_int) == 0:
            continue

        report_train[ntrain] = text_int

        wdata['text'] = clean_string(sent)
        wdata['num_words'] = len(text_int)
        report_text_train[ntrain] = wdata
        ntrain += 1

    report_train.attrs['nsample'] = ntrain
    report_train.attrs['vocab_size'] = vocab_size
    report_text_train.attrs['nsample'] = ntrain
    report_text_train.attrs['vocab_size'] = vocab_size

    if valid_split:
        neon_logger.display("\nProcessing and saving validation data into {}".format(h5_file_name))
        shape = (len(valid_sent),)
        report_text_valid = h5f.create_dataset('report_valid', shape=shape,
                                               maxshape=maxshape, dtype=dt,
                                               compression='gzip')
        report_valid = h5f.create_dataset('valid', shape=shape, maxshape=maxshape,
                                          dtype=h5py.special_dtype(vlen=np.int32),
                                          compression='gzip')
        nvalid = 0
        for sent in valid_sent:
            text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

            # enforce maximum sentence length
            if len(text_int) > max_len_w or len(text_int) == 0:
                continue

            report_valid[nvalid] = text_int
            wdata['text'] = clean_string(sent)
            wdata['num_words'] = len(text_int)
            report_text_valid[nvalid] = wdata
            nvalid += 1

        report_valid.attrs['nsample'] = nvalid
        report_valid.attrs['vocab_size'] = vocab_size
        report_text_valid.attrs['nsample'] = nvalid
        report_text_valid.attrs['vocab_size'] = vocab_size

    h5f.close()

    return h5_file_name, vocab_file_name
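
For orientation, here is a minimal sketch (not part of the original example) of how the cached artifacts returned above might be read back; the file names are placeholders and the load_obj import assumes neon's persistence helpers.

import h5py
from neon.util.persist import load_obj  # assumed location of the helper used above

h5_file_name = 'train.h5'       # placeholder paths; use the values returned
vocab_file_name = 'vocab.pkl'   # by the preprocessing function

vocab, rev_vocab, word_count = load_obj(vocab_file_name)
with h5py.File(h5_file_name, 'r') as h5f:
    train = h5f['train']
    print("training sentences:", train.attrs['nsample'])
    first_sent_ids = train[0]   # vlen int32 token ids; -1 marks out-of-vocab words
    print(" ".join(rev_vocab.get(int(i), "<unk>") for i in first_sent_ids))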
コード例 #36
0
def _get_downloadable_from_url(video_url, resolution):
    global RESOLUTION
    global ERROR_MSG
    global HEADER

    # Fetch the video page (the m3u8 address is extracted from it below),
    # retrying on connection errors until CONNECTION_TIMEOUT is exceeded
    start_time = time.time()
    while True:
        try:
            r = requests.get(video_url, headers=HEADER)
            break
        except requests.ConnectionError:
            if time.time() > start_time + CONNECTION_TIMEOUT:
                raise Exception("Unable to get video_url %s \nafter %s seconds of ConnectionErrors" \
                        % (video_url,CONNECTION_TIMEOUT))
            else:
                time.sleep(1)

    video_html = etree.HTML(r.text)
    title = video_html.xpath('//span[@class="title1"]')[0].text  # e.g. 《命运的X号》剧场公演
    info = video_html.xpath(
        '//span[@class="title2"]')[0].text  # e.g. TeamX 剧场公演 2018.01.04

    # Build the file name
    fname = title
    if not fname.startswith("《"):
        fname = "《" + fname + "》"

    date_string = util.crush_time(
        info)[:10]  # if no date found, use info[:10] part

    fname = date_string + ' ' + fname  # e.g. 20180202 《48狼人杀》

    if "星梦Mini" in fname:
        fname = fname + ' ' + re.sub(
            '本期成员:', '',
            re.search(r'.*' + date_string[:4], info).group(0)[:-4])
    if "48狼人杀" in fname or "公演" in fname:
        fname = fname + ' ' + re.search(r'.*' + date_string[:4],
                                        info).group(0)[:-4]

    fname = util.clean_string(fname, 'filename')

    chao_url = video_html.xpath('//input[@id="chao_url"]/@value')[0]
    gao_url = video_html.xpath('//input[@id="gao_url"]/@value')[0]
    liuchang_url = video_html.xpath('//input[@id="liuchang_url"]/@value')[0]

    # Default: chaoqing (ultra-HD) source
    RESOLUTION = resolution
    if RESOLUTION == 'chaoqing':
        if chao_url == "" or requests.get(
                chao_url, timeout=CONNECTION_TIMEOUT).text == "\n":
            print("未找到超清源,降低视频清晰度")
            RESOLUTION = "gaoqing"
            m3u8_url = gao_url
        else:
            m3u8_url = chao_url

    if RESOLUTION == 'gaoqing':
        if not gao_url or requests.get(gao_url,
                                       timeout=CONNECTION_TIMEOUT,
                                       headers=HEADER).text == "\n":
            print("未找到高清源,降低视频清晰度")
            RESOLUTION = "liuchang"
            m3u8_url = liuchang_url
        else:
            m3u8_url = gao_url

    if RESOLUTION == 'liuchang':
        if not liuchang_url or requests.get(liuchang_url,
                                            timeout=CONNECTION_TIMEOUT,
                                            headers=HEADER).text == "\n":
            print("未找到流畅源,skip current operation: %s" % title)
            return {}  # return empty object
        else:
            m3u8_url = liuchang_url

    # Parse the ts segment list out of the m3u8 playlist available on this page
    ts_list = _get_ts_from_m3u8(m3u8_url)

    print("已解析: %s" % fname)
    return {
        'title': title,
        'info': info,
        'fname': fname,
        'm3u8_url': m3u8_url,
        'site_url': video_url,
        'ts_list': ts_list
    }
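
As a rough usage sketch (an assumption, not code from the original script): ts_list in the returned dict is treated here as a list of absolute .ts segment URLs, and HEADER / CONNECTION_TIMEOUT are the module globals referenced above.

import os
import requests

def download_ts_segments(downloadable, out_dir="."):
    # An empty dict means no playable source was found for this page.
    if not downloadable:
        return None
    out_path = os.path.join(out_dir, downloadable['fname'] + ".ts")
    with open(out_path, 'wb') as f:
        for ts_url in downloadable['ts_list']:
            resp = requests.get(ts_url, headers=HEADER, timeout=CONNECTION_TIMEOUT)
            resp.raise_for_status()
            f.write(resp.content)  # concatenating MPEG-TS segments yields a playable file
    return out_path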
コード例 #37
0
ファイル: active_scraper.py プロジェクト: Sutedja/races
              # "Awards",
               "Notes")
    output = csv.DictWriter(f, headers)
    output.writeheader()

    events = soup.find("section", "section activities-block refined-search-container").find("div", {"id": "lpf-tabs2-a"}).find_all("article")
    for event in events:
        event_name = get_text_if_exists(event, "h5", {"class": "title"})
        event_date = event.find("span", {"itemprop": "startDate"})["content"].split("T")[0]
        lat, long = [float(x) for x in event["data-geo-point"].split(',')]
        detail_url = 'http://www.active.com' + event.find("a", "ie-article-link")["href"]
        types = get_text_if_exists(event, "h6", {"class":"secondary-text desc-info pull-left"})
        detail_soup = BeautifulSoup(urlopen(detail_url).read())
        event_day = get_text_if_exists(detail_soup.find("div", "visible-desktop"), "h5").split(",")[0]
        address_name = get_text_if_exists(detail_soup, "span", {"itemprop": "name"})
        address = clean_string(get_text_if_exists(detail_soup, "span", {"itemprop": "address"}), utf8=True)
        notes = clean_string(get_text_if_exists(detail_soup, "div", {"itemprop": "description"}), utf8=True)
        prices = []
        has_prices = detail_soup.find("div", "price-grid")
        if has_prices:
            name_prices = has_prices.find_all("div", "row price-row")
            for name_price in name_prices:
                event_type = get_text_if_exists(name_price, "h5", {"itemprop": "name"})
                price = get_text_if_exists(name_price, "h5", {"itemprop": "Price"})
                prices.append((event_type, price))
        event_dict = {"Date": None,
                       "Day": event_day,
                       "Event Name": event_name,
                       "url": detail_url,
                       "Types": types,
                       "Location": address,
コード例 #38
0
ファイル: groceryScraper.py プロジェクト: gobfink/Groceries
    def parse(self, response):
        url = response.url
        self.logger.info(f"Inside parse for {url}")

        GROCERY_SELECTOR = '[data-automation-id="productTile"]'
        SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
        GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        for grocery in response.css(GROCERIES_SELECTOR):
            NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
            name = grocery.css(NAME_SELECTOR).extract_first()
            # parse the size (ounces / pounds / count) out of the product name
            decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
            ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                name, re.IGNORECASE)
            pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                name, re.IGNORECASE)
            count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                               name, re.IGNORECASE)
            self.ounce = ounces
            self.pounds = pounds
            self.count = count
            # re.findall returns a list; an empty list means the pattern did not match
            if ounces:
                ounces = parse_float(ounces[0])
            else:
                ounces = 0
            if pounds:
                pounds = parse_float(pounds[0])
            else:
                pounds = 0
            if count:
                count = parse_float(count[0])
            else:
                count = 0

            if pounds != 0:
                ounces = 16*pounds
            elif count != 0:
                ounces *= count

            # inspect_response(response, self)
            SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
            PRICE_SELECTOR = '[data-automation-id="price"] ::text'
            PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

            name = clean_string(name, "\"")
            price = str(handle_none(grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
            ppu = convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())

            yield {
                'name': name,
                'ounces': ounces,
                'pounds': pounds,
                'count': count,
                'price': price,
                'price-per-unit': ppu,
                'section': section,
                'subsection': subsection,
                'url': url,
            }

        finish_url(self.conn, self.store_id, url)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id, filter="aisle=")

        print(f"next_url - {next_url}")
        if next_url is None:
            print ("No more urls - finishing")
        else:
            request = create_parse_request(next_url,
                                           self.parse,
                                           EC.element_to_be_clickable(
                                               (By.CSS_SELECTOR, '[aria-current="page"]')),
                                           meta_url=next_url)
            yield request
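
The size-parsing logic in parse() above is easiest to see in isolation; the snippet below is a self-contained illustration with made-up product names, not part of the original spider.

import re

DECIMAL = r"([\d]+[.]?[\d]*|[.\d]+)"

def size_in_ounces(name):
    ounces = re.findall(DECIMAL + r"\s*o(?:z|unces?)", name, re.IGNORECASE)
    pounds = re.findall(DECIMAL + r"\s*(?:pound|lb)s?", name, re.IGNORECASE)
    count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name, re.IGNORECASE)
    ounces = float(ounces[0]) if ounces else 0.0
    pounds = float(pounds[0]) if pounds else 0.0
    count = float(count[0]) if count else 0.0
    if pounds:
        return 16 * pounds      # pounds win over ounces, as in parse()
    if count:
        return ounces * count   # per-item ounces times pack count
    return ounces

print(size_in_ounces("Whole Milk, 1 lb"))               # 16.0
print(size_in_ounces("Sparkling Water 12 oz, 8 pack"))  # 96.0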
コード例 #39
0
ファイル: Cache.py プロジェクト: kmpeters/epicsarchiver
    def set_value(self, pv=None, **kws):
        # clean_string escapes each value so it can be spliced into the SQL text below
        v = [clean_string(i) for i in [pv.value, pv.char_value, time.time()]]
        v.append(pv.pvname)
        qval = "update cache set value=%s,cvalue=%s,ts=%s where pvname='%s'" % tuple(v)
        self.db.execute(qval)
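
The method above sanitizes each value with clean_string and splices it into the SQL string itself. Where the driver supports it, a parameterized query avoids the quoting problem entirely; the sketch below assumes a DB-API cursor with MySQLdb-style %s placeholders, which may not match the wrapper used by this project.

import time

def set_value_param(cursor, pv):
    # let the driver handle quoting/escaping instead of clean_string
    cursor.execute(
        "update cache set value=%s, cvalue=%s, ts=%s where pvname=%s",
        (pv.value, pv.char_value, time.time(), pv.pvname))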