def check_headline(headline):
    """Pick the most useful fragment of a headline and classify it.

    The headline is HTML-unescaped, lower-cased, tokenized, tagged with
    ``NLPParser.t3``, passed through ``nltk.ne_chunk`` (binary mode) and
    finally chunked with ``NLPParser.cp``.  The top-level chunks of the
    result are then inspected in this order:

    1. the first ``SBAR`` chunk  -> ``(its words, 'sentence')``
    2. the first ``NE`` chunk    -> ``(its words, 'topic')``
    3. the longest ``NP`` chunk (the last one wins on ties)
                                 -> ``(its words, 'topic')``
    4. otherwise ``NLPParser.simple_headline`` of the processed headline,
       labelled ``'sentence'`` if any tag starts with ``VB`` and
       ``'topic'`` if none does.

    :param headline: raw headline text, possibly containing HTML entities
    :return: tuple ``(text, kind)`` where kind is 'sentence' or 'topic'
    """
    headline = HTMLParser().unescape(headline).lower()
    tagged = NLPParser.t3.tag(nltk.word_tokenize(headline))
    parsed = NLPParser.cp.parse(nltk.ne_chunk(tagged, binary=True))

    def chunks(label):
        # Top-level children that are exactly nltk.Tree and carry `label`.
        return (child for child in parsed
                if type(child) is nltk.Tree and child.label() == label)

    def words_of(leaves):
        # Leaves are indexed pairs; the first element is the token text.
        return ' '.join([leaf[0] for leaf in leaves])

    clause = next(chunks('SBAR'), None)
    if clause is not None:
        return words_of(clause.leaves()), 'sentence'

    entity = next(chunks('NE'), None)
    if entity is not None:
        return words_of(entity.leaves()), 'topic'

    longest = []
    for phrase in chunks('NP'):
        candidate = phrase.leaves()
        # NOTE(review): leaves are indexed like pairs elsewhere in this
        # function, so a membership test against the bare string 'PP'
        # presumably never matches -- confirm the intended filter.
        if 'PP' in candidate:
            continue
        if len(candidate) >= len(longest):
            longest = candidate
    if longest:
        return words_of(longest), 'topic'

    # No usable chunk: a verb tag decides between 'sentence' and 'topic'.
    has_verb = any(re.match(r'VB.*', pair[1]) for pair in tagged)
    kind = 'sentence' if has_verb else 'topic'
    return NLPParser.simple_headline(headline), kind
def uncode_name(name):
    """Strip CDATA markers, decode HTML character references and lower-case.

    The text is unescaped *before* it is lower-cased.  Doing it the other
    way round leaves characters produced by numeric references upper-case
    (``&#65;`` -> ``A``) and rewrites case-sensitive entity names
    (``&Dagger;`` would become ``&dagger;``, a different character).

    :param name: string to convert
    :return: converted, lower-cased string
    """
    try:
        # Python 3.4+; HTMLParser.unescape() was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2 fallback.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # Only the literal markers '<![CDATA[' and ']]' are removed.
    name = name.replace('<![CDATA[', '').replace(']]', '')
    return unescape(name).lower()
def cleanUpMatch(match):
    """Normalize a matched text fragment to plain lower-case text.

    Applies a fixed sequence of regex substitutions, then the character
    replacements from ``TRANSLATIONS``, collapses whitespace, decodes HTML
    entities and lower-cases the result.

    :param match: raw matched text (may contain markup such as ``<lb/>``)
    :return: cleaned, lower-cased text
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        # everything between two double line breaks
        (r'\n\s*\n.*?\n\s*\n', '', re.DOTALL),
        # soft hyphen followed by newlines / <lb/> tags
        (r'­(\n)*(<lb/>)*', '', 0),
        # the \xc2\xad sequence followed by newlines / <lb/> tags
        (r'\xc2\xad(\n)*(<lb/>)*', '', 0),
        # remaining <lb/> tags and newlines become a single space
        (r'<lb/>|\n', ' ', 0),
        # any other tag
        (r'<.*?>', '', 0),
    )
    text = match
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # translate special characters
    for special in TRANSLATIONS:
        text = text.replace(special, TRANSLATIONS[special])

    # squeeze runs of whitespace, then decode entities and lower-case
    squeezed = ' '.join(text.split())
    return HTMLParser().unescape(squeezed).lower()
def un_code_name(name):
    """
    Strip CDATA markers, decode HTML character references and lower-case

    The text is unescaped *before* it is lower-cased: lower-casing first
    leaves characters produced by numeric references upper-case
    (``&#65;`` -> ``A``) and rewrites case-sensitive entity names
    (``&Dagger;`` would become ``&dagger;``, a different character).

    :param name: string to convert
    :type name: str
    :return: converted, lower-cased string
    """
    try:
        # Python 3.4+; HTMLParser.unescape() was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2 fallback.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # Only the literal markers '<![CDATA[' and ']]' are removed.
    name = name.replace('<![CDATA[', '').replace(']]', '')
    return unescape(name).lower()
def normalize_string(string, charset=None, replacing=False):
    """
    Decode and convert any string to lower-case unicode

    Values that are not already ``unicode`` are first decoded (see the
    inline comments for the fallback chain).  The text is then passed
    through ``remove_control_chars``, ``fix_bad_unicode`` and ``unquote``,
    stripped of the literal markers ``<![CDATA[`` and ``]]``,
    HTML-unescaped, optionally stripped of apostrophes, and lower-cased.

    :param charset: encoding forwarded to ``json.loads``
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: whether apostrophes (') are removed from the result
    :type replacing: bool
    :return: converted unicode; ``u''`` if decoding raises ``LookupError``
    :rtype: unicode
    """
    # NOTE(review): block nesting below was reconstructed from
    # whitespace-flattened source -- confirm against the original file.
    if not isinstance(string, unicode):
        try:
            # '=' followed by two hex digits: treat as quoted-printable.
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = string.decode('Quoted-printable')
            # Try to read the value as a JSON document.
            string = json.loads(u'%s' % string, encoding=charset)
        except ValueError:
            try:
                # NOTE(review): SECURITY -- eval() executes whatever
                # expression the input contains.  If `string` can come from
                # an untrusted source this is arbitrary code execution;
                # consider ast.literal_eval (its exception types differ,
                # so the handlers below would need revisiting).
                string = unicode(eval(string), 'raw_unicode_escape')
            except (SyntaxError, NameError):
                # Not a Python expression: decode the bytes as latin-1.
                string = string.decode('latin-1')
                pass
            except TypeError:
                # Last resort: decode while dropping undecodable bytes.
                string = unicode(string, errors='ignore')
                pass
        except LookupError:
            # Give up and return an empty result.
            return u''
        except TypeError:
            # Last resort: decode while dropping undecodable bytes.
            string = unicode(string, errors='ignore')
            pass
    # Clean-up helpers; these names are defined outside this block.
    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # Only the literal markers '<![CDATA[' and ']]' are removed.
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')
    string = string.lower()
    return string
def uncodeName(name):
    """Strip CDATA markers, decode HTML character references and lower-case.

    The text is unescaped *before* it is lower-cased.  Doing it the other
    way round leaves characters produced by numeric references upper-case
    (``&#65;`` -> ``A``) and rewrites case-sensitive entity names
    (``&Dagger;`` would become ``&dagger;``, a different character).

    :param name: string to convert
    :return: converted, lower-cased string
    """
    try:
        # Python 3.4+; HTMLParser.unescape() was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2 fallback.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # Only the literal markers '<![CDATA[' and ']]' are removed.
    name = name.replace('<![CDATA[', '').replace(']]', '')
    return unescape(name).lower()
def uncode_name(name):
    """Strip CDATA markers, decode HTML character references and lower-case.

    The text is unescaped *before* it is lower-cased.  Doing it the other
    way round leaves characters produced by numeric references upper-case
    (``&#65;`` -> ``A``) and rewrites case-sensitive entity names
    (``&Dagger;`` would become ``&dagger;``, a different character).

    :param name: string to convert
    :return: converted, lower-cased string
    """
    try:
        # Python 3.4+; HTMLParser.unescape() was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2 fallback.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # Only the literal markers "<![CDATA[" and "]]" are removed.
    name = name.replace("<![CDATA[", "").replace("]]", "")
    return unescape(name).lower()
def populate_restaurants(c):
    """Fill the Restaurants table from the raw files under ``restaurants/``.

    Reads ``suburbs.txt`` (tab-separated: lat, lng, postcode, suburb) to
    build a suburb -> postcode lookup, then parses every file matched by
    ``restaurants/*``.  Lines 0-6 of each file are read as name, address,
    phone, hours, cuisine, cost and image.  Files that cannot be parsed
    are reported on stderr and skipped; restaurants whose suburb has no
    known postcode are skipped silently.  A fake website is generated for
    each row and roughly one row in four gets a random owner from the
    Users table.

    Exits the process with status 1 if ``restaurants`` or ``suburbs.txt``
    is not accessible.

    :param c: database cursor/connection exposing ``execute``
    """
    print('Populating Restaurants table...')

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        sys.stderr.write(
            "Error: cannot access raw data directory 'restaurants'\n")
        sys.exit(1)

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        sys.stderr.write(
            "Error: cannot access raw data file 'suburbs.txt'\n")
        sys.exit(1)

    # get postcodes from file and cache in dict
    postcodes = {}
    with open('suburbs.txt') as suburbs_file:
        for line in suburbs_file:
            lat, lng, pst, sub = line.strip().split('\t')
            postcodes[sub] = pst
    postcodes['CBD'] = 2000  # special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()
    num_users = len(users)

    i = 0
    for restaurant in glob.glob('restaurants/*'):
        with open(restaurant) as raw_file:
            r = raw_file.readlines()

        # extract info from file
        try:
            name = r[0].strip()
            name = HTMLParser().unescape(name)
            address = r[1].strip()
            address = HTMLParser().unescape(address)
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = HTMLParser().unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = HTMLParser().unescape(hours)
            cuisine = r[4].strip()
            cuisine = HTMLParser().unescape(cuisine)
            cost = r[5].strip()
            image = r[6].strip()
        except Exception:
            # Best effort: any malformed file is skipped.  Catching
            # Exception rather than using a bare except keeps Ctrl-C and
            # sys.exit() working.
            sys.stderr.write("Error: skipping '%s'\n" % restaurant)
            continue

        # lookup postcode using suburb
        if suburb not in postcodes:
            continue
        postcode = postcodes[suburb]

        # and append it to the address
        address = address + ' ' + str(postcode)

        # choose a random protocol for the website
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        # make site of the form
        # protocol://www.lowercase.name.of.restaurant.fake.com
        # NOTE(review): the first replace() maps a space to a space as
        # written; presumably it was meant to collapse a different
        # sequence (e.g. a double space) -- confirm.
        website = name.replace(' ', ' ').replace(' ', '.').replace(
            '-', '').strip() + '.fake.com'
        website = HTMLParser().unescape(website)
        website = urllib.quote(website)  # encode as url
        website = protocol + 'www.' + website  # avoid encoding the protocol
        website = website.lower().replace('..', '.')

        # ensure only some restaurants have owners; with no users at all
        # the restaurant is simply left without an owner
        owner = None
        if random.randint(0, 3) == 0 and users:
            owner = users[random.randint(0, num_users - 1)][0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants (id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
            data)
# Sample random VK club pages, collect the words used in their titles and
# print / append to vk.txt the ten most common ones.
# NOTE(review): block nesting was reconstructed from whitespace-flattened
# source -- confirm, in particular, where `i +=1` belongs.
f = codecs.open('vk.txt', mode = 'a', encoding= 'utf-8')
f.write('--------------------\n')
# The Russian literal below reads "number of clubs - ".
f.write(u'количество клубов - ' + str(quantity_of_clubs) + u'\n')
i = 1
while i <= quantity_of_clubs:
    club_number = randrange(1000000, 30000000)
    # Each randomly drawn club id is fetched at most once.
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' % club_number).read().decode('utf8')
        title = HTMLParser().unescape(re.findall('<title>(.*)</title>', data)[0])
        # Skip pages titled "Ошибка" ("Error") or "Частная группа"
        # ("Private group").
        if title not in ([u'Ошибка', u'Частная группа']):
            # "Участники" means "Members"; findall returns a list here, so
            # `quantity` is a list of the captured digit strings.
            quantity = HTMLParser().unescape(re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
            words = title.lower().split(' ')
            for word in words:
                # discard single-character words and words from the exclusion list
                if len(word) != 1 and word not in words_to_exclude:
                    new_word = rx3.sub(u'-', (rx2.sub(u'', rx1.sub(u'',word)))) # remove everything except letters and '-'
                    # discard empty words, single-character words and words from the exclusion list
                    if new_word != '' and len(new_word) != 1 and new_word not in words_to_exclude:
                        words_in_club_titles.append(new_word)
            i +=1
counts = Counter(words_in_club_titles)
top = counts.most_common(10)
# Report each (word, count) pair on stdout and in the output file.
for element in top:
    print "'%s', %d " %(element[0], element[1])
    f.write("'%s', %d " %(element[0], element[1]))
# Sample random VK club pages and collect the words used in their titles,
# then take the ten most common ones.
# NOTE(review): block nesting was reconstructed from whitespace-flattened
# source -- confirm, in particular, where `i += 1` belongs.
i = 1
while i <= quantity_of_clubs:
    club_number = randrange(1000000, 30000000)
    # Each randomly drawn club id is fetched at most once.
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' %
                       club_number).read().decode('utf8')
        title = HTMLParser().unescape(
            re.findall('<title>(.*)</title>', data)[0])
        # Skip pages titled "Ошибка" ("Error") or "Частная группа"
        # ("Private group").
        if title not in ([u'Ошибка', u'Частная группа']):
            # "Участники" means "Members"; findall returns a list here, so
            # `quantity` is a list of the captured digit strings.
            quantity = HTMLParser().unescape(
                re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',
                           data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
            words = title.lower().split(' ')
            for word in words:
                # discard single-character words and words from the exclusion list
                if len(word) != 1 and word not in words_to_exclude:
                    new_word = rx3.sub(
                        u'-', (rx2.sub(u'', rx1.sub(
                            u'', word))))  # remove everything except letters and '-'
                    # discard empty words, single-character words and words from the exclusion list
                    if new_word != '' and len(
                            new_word
                    ) != 1 and new_word not in words_to_exclude:
                        words_in_club_titles.append(new_word)
            i += 1
counts = Counter(words_in_club_titles)
top = counts.most_common(10)