Code Example #1
File: emimino.py Project: sonineties/cancer-works
    def parse(self, response):
        texts = response.xpath('//div[@class="emojione-output"]').extract()
        names = response.xpath('//div[@class="user_in"]/b').extract()
        dates = response.xpath(
            '//div[@class="user_in"]//span[@class="date"]/span/text()'
        ).extract()

        for i in range(len(texts)):
            # drop any <blockquote> sections before cleaning the text
            text = re.sub(r'<blockquote>.*?</blockquote>',
                          '',
                          texts[i],
                          flags=re.DOTALL)
            text = functions.strip_accents(text.strip())
            text = functions.clean_text(text)
            name = re.sub('<[^<]+?>', '', names[i])
            name = functions.strip_accents(name)
            date = functions.process_date_emimino(dates[i].strip().split()[0])

            # add to db
            yield {
                'domain': self.domain,
                'url': self.start_url,
                'text': text,
                'date': date,
                'name': name
            }
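The shared functions module is not shown on this page. functions.strip_accents is presumably the usual Unicode-normalization idiom; a minimal sketch under that assumption (the real helper may differ):

# Assumed implementation of a strip_accents helper: decompose accented
# characters with NFKD and drop the combining marks.
import unicodedata

def strip_accents(text):
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

# e.g. strip_accents('Příliš žluťoučký kůň') -> 'Prilis zlutoucky kun'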
Code Example #2
    def parse(self, response):
        texts = response.xpath(
            '//div[@class="diskuse-prispevek"]//div[@class="popis"]').extract()
        names = response.xpath(
            '//div[@class="diskuse-prispevek"]//span[@class="jmeno"]').extract()
        dates = response.xpath(
            '//div[@class="diskuse-prispevek"]//span[@class="datum"]').extract()

        for i in range(len(texts)):
            # extract() already returns text, so the Python 2 style
            # .encode('utf8') calls are not needed; just strip the wrapper tags
            text = texts[i].replace('<div class="popis">', '').replace('</div>', '')
            text = functions.strip_accents(text)
            text = functions.clean_text(text)
            name = names[i].replace('<span class="jmeno">', '').replace('</span>', '')
            name = functions.strip_accents(name)
            date = dates[i].replace('<span class="datum">', '').replace(
                '</span>', '').strip(',').strip().split()[0]
            date = functions.process_date_abc(date)

            # add to db
            yield {
                'domain': self.domain,
                'url': self.start_url,
                'text': text,
                'date': date,
                'name': name
            }
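An alternative sketch of the same kind of spider: iterating over one post selector at a time and pulling text nodes with //text() avoids the manual tag replacement and keeps text, name and date from the same post aligned. It assumes the same page structure; functions, self.domain and self.start_url are the project's own names.

    # Sketch: select per post and extract text nodes directly, no tag stripping.
    def parse(self, response):
        for post in response.xpath('//div[@class="diskuse-prispevek"]'):
            text = ''.join(post.xpath('.//div[@class="popis"]//text()').extract())
            name = ''.join(post.xpath('.//span[@class="jmeno"]//text()').extract())
            date = ''.join(post.xpath('.//span[@class="datum"]//text()').extract())
            date = date.strip().strip(',').split()[0]

            yield {
                'domain': self.domain,
                'url': self.start_url,
                'text': functions.clean_text(functions.strip_accents(text.strip())),
                'date': functions.process_date_abc(date),
                'name': functions.strip_accents(name.strip()),
            }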
Code Example #3
File: rehot_sk.py Project: gnd/bolka10080
    def parse(self, response):
        jokes = response.xpath('//div[@class="joke-content"]').extract()

        for i in range(len(jokes)):
            joke = jokes[i].replace('<div class="joke-content">', '').replace(
                '</div>', '')
            joke = functions.strip_accents(joke)
            joke = functions.clean_text(joke)

            # add to db
            yield {'domain': self.domain, 'url': self.start_url, 'joke': joke}
Code Example #4
    def parse(self, response):
        texts = response.xpath('//div[@class="emojione-output"]').extract()
        names = response.xpath('//div[@class="user_in"]').extract()
        tmp_dates = response.xpath('//span[@class="date"]').extract()
        dates = []
        for date in tmp_dates:
            if ('přís' not in date):
                dates.append(date)

        for i in range(len(texts)):
            text = texts[i].replace(
                '<div class="emojione-output">\n\t\t\t\n<p>',
                '').replace('</p>\n\n\t\t</div>', '')
            text = functions.strip_accents(text)
            text = functions.clean_text(text)
            if ('komunita' in names[i]):
                name = re.sub(
                    '<[^<]+?>', '',
                    names[i].replace('<div class="user_in">\n \t \n\t\t\t<b>', ''))
                name = name.split('\n')[0]
                name = functions.strip_accents(name)
            else:
                name = names[i].replace('<div class="user_in">', '').replace(
                    '<b>', '').split('<a name=')[0].strip()
                name = functions.strip_accents(name)
            date = functions.process_date_vitalion(
                dates[i].replace('<span class="date"><span> ', '').replace(
                    '</span></span>', ''))

            # add to db
            yield {
                'domain': self.domain,
                'url': self.start_url,
                'text': text,
                'date': date,
                'name': name
            }
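The site-specific functions.process_date_* helpers (process_date_emimino, process_date_abc, process_date_vitalion, process_date_doktorka) are not shown on this page. A hypothetical sketch, assuming the scraped strings are Czech-style day-first dates such as '24.6.2015' (the real helpers may differ):

# Hypothetical date normalizer; input format is an assumption.
from datetime import datetime

def process_date(raw):
    raw = raw.strip().strip('.')
    for fmt in ('%d.%m.%Y', '%d. %m. %Y', '%d.%m.%y'):
        try:
            return datetime.strptime(raw, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return raw  # fall back to the raw string if the format is unexpected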
Code Example #5
    def parse(self, response):
        texts = response.xpath('//div[@class="field-item even"]').extract()
        names = response.xpath('//span[@class="username"]').extract()
        dates = response.xpath('//div[@class="small"]').extract()

        for i in range(len(texts)):
            # process only entries that do not contain the Isaac Asimov quote block
            if 'Isaac Asimov' not in texts[i]:
                text = texts[i].replace(
                    '<div class="field-item even" property="content:encoded">',
                    '').replace('</div>', '')
                text = functions.strip_accents(text)
                text = functions.clean_text(text)
                name = names[i].replace(
                    '<span class="username" xml:lang="" typeof="sioc:UserAccount" '
                    'property="foaf:name" datatype="">', '').replace('</span>', '')
                name = functions.strip_accents(name)
                date = dates[i].replace('<div class="small"> ', '').replace(
                    ' </div>', '')
                date = functions.process_date_doktorka(date)

                # add to db (only for entries that were not skipped above)
                yield {
                    'domain': self.domain,
                    'url': self.start_url,
                    'text': text,
                    'date': date,
                    'name': name
                }
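Each spider only yields a dict and notes "# add to db"; in Scrapy the actual persistence would normally happen in an item pipeline, which is not part of these snippets. A minimal hypothetical sketch using SQLite (table and column names are assumptions):

# Hypothetical item pipeline for the "# add to db" step; the real pipeline
# used by these projects is not shown on this page.
import sqlite3

class SqlitePipeline(object):
    def open_spider(self, spider):
        self.conn = sqlite3.connect('scraped.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS posts '
            '(domain TEXT, url TEXT, text TEXT, date TEXT, name TEXT)')

    def process_item(self, item, spider):
        self.conn.execute(
            'INSERT INTO posts VALUES (?, ?, ?, ?, ?)',
            (item.get('domain'), item.get('url'), item.get('text'),
             item.get('date'), item.get('name')))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

Such a pipeline would be enabled through the project's ITEM_PIPELINES setting.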
Code Example #6
File: train.py Project: Acmezon/text-prediction
def train(filename):
    start_time = time.time()

    unigram_letters = 'training/unigram_letter.json'
    bigram_letters = 'training/bigram_letter.json'

    unigram_words = 'training/unigram_words.json'
    bigram_words = 'training/bigram_words.json'

    # When a corpus file is given, drop any previously trained model files
    # so they are rebuilt from the new corpus.
    if filename:
        try:
            os.remove(unigram_letters)
        except OSError:
            pass

        try:
            os.remove(bigram_letters)
        except OSError:
            pass

        try:
            os.remove(unigram_words)
        except OSError:
            pass

        try:
            os.remove(bigram_words)
        except OSError:
            pass
    else:
        filename = 'corpus/corpus_sm.txt'

    # Lowercase the corpus and strip accents before tokenizing.
    s = functions.strip_accents(
        codecs.open(filename, 'r', encoding='utf-8').read().lower())
    words = re.findall(r'\b[a-z]+\b', s)
    # The lookahead groups yield overlapping word and letter bigrams.
    bi_words = re.findall(r'(?=([a-z]+\s+[a-z]+))[a-z]+\s+', s)

    letters = re.findall(r'[a-z]', s)
    bi_letters = re.findall(r'(?=([a-z][a-z]))[a-z]', s)

    words_count = Counter(words)
    bi_words_count = Counter(bi_words)

    letters_count = Counter(letters)
    bi_letters_count = Counter(bi_letters)

    print("Counters took {0:.2f} seconds".format(time.time() - start_time))

    print("---------Training begins")

    if not os.path.isfile(unigram_words):
        train_unigram_words(words_count, unigram_words)
        print("")
        print("---------Unigram words trained")
    else:
        print("---------Unigram words already trained")

    if not os.path.isfile(bigram_words):
        train_bigram_words(bi_words_count, bigram_words)
        print("")
        print("---------Bigram words trained")
    else:
        print("---------Bigram words already trained")

    if not os.path.isfile(unigram_letters):
        train_unigram_letters(letters_count, unigram_letters)
        print("---------Unigram letters trained")
    else:
        print("---------Unigram letters already trained")

    if not os.path.isfile(bigram_letters):
        train_bigram_letters(bi_letters_count, bigram_letters)
        print("")
        print("---------Bigram letters trained")
    else:
        print("---------Bigram letters already trained")

    print("---------Training finished: it took {0:.2f} seconds".format(
        time.time() - start_time))