def fml(inp):
    ".fml [id] -- Gets a random quote from fmyfife.com. Optionally gets [id]."
    inp = inp.replace("#", "")
    if inp:
        if not inp.isdigit():
            return "Invalid ID!"
        try:
            page = http.get(urljoin(base_url, inp))
        except (HTTPError, IOError):
            return "Could not fetch #%s. FML" % inp
    else:
        try:
            page = http.get(urljoin(base_url, 'random'))
        except (HTTPError, IOError):
            return "I tried to use .fml, but it was broken. FML"

    soup = BeautifulSoup(page)
    # The submit form contains 'fmllink' anchors too; drop it so only
    # the post's own links remain.
    soup.find('div', id='submit').extract()

    post = soup.body.find('div', 'post')
    try:
        # 'post_id' rather than 'id' -- avoids shadowing the builtin id().
        post_id = int(post.find('a', 'fmllink')['href'].split('/')[-1])
    except TypeError:
        return "Could not fetch #%s. FML" % inp
    body = misc.strip_html(' '.join(link.renderContents()
                                    for link in post('a', 'fmllink')))
    return '(#%d) %s' % (post_id, body)
def get_fact():
    """Fetch one random fact from omg-facts.com.

    Returns a ``(fact, link)`` tuple; raises ``nofact`` when the page
    yields no usable fact text.
    """
    html = http.get('http://www.omg-facts.com/random')
    anchor = BeautifulSoup(html).find('a', {'class': 'surprise'})
    url = anchor['href']
    text = misc.strip_html(anchor.renderContents())
    if not text:
        raise nofact
    return (text, url)
def sloganizr(inp, nick=None, say=None, input=None):
    ".slogan <word> -- Makes a slogan for <word>."
    # nick/say/input are part of the bot's hook signature; unused here.
    result = misc.strip_html(sloganize(inp))
    # When the user typed all lower-case, capitalize the first word so
    # the slogan still reads like a sentence.
    if inp.islower():
        words = result.split()
        words[0] = words[0].capitalize()
        result = " ".join(words)
    return result
def convert_text_to_words(raw_qa_s):
    """Convert and filter the i/p text string into a string of words.

    Convert a raw stackoverflow question or answer to a string of
    meaningful words for detailed analysis.

    The input is a single string of text.  That content is processed
    in various ways, eg, remove HTML, remove non-letters, convert to
    lower-case, and remove stop words that clutter the output.

    Return a single string of meaningful words.
    """
    # 1. Remove HTML markup.
    qa_text = ut.strip_html(raw_qa_s, "lxml")
    # 2. Keep letters only; every other character becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", qa_text)
    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching a
    # list, so convert the stop words to a set.
    stops = set(stopwords.words("english"))
    # Add more noise terms to stopwords.
    stops.add('th')
    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one string of words, each word
    # separated by a space, and return the resulting string.
    return " ".join(meaningful_words)
def calc(inp):
    ".calc <term> -- Calculate <term> with Google Calc."
    # Used below to collapse any run of whitespace to a single space.
    white_re = re.compile(r'\s+')

    page = http.get('http://www.google.com/search', q=inp)
    soup = BeautifulSoup(page)

    # Google renders the calculator result in an <h2 class="r"> element;
    # its absence means the query was not recognized as a calculation.
    response = soup.find('h2', {'class': 'r'})
    if response is None:
        return "Could not calculate " + inp

    output = response.renderContents()
    # Flatten multi-line markup onto a single line.
    output = ' '.join(output.splitlines())
    # presumably "\xa0" here is the non-breaking-space thousands
    # separator in Google's output, shown to the user as a comma --
    # verify against a live response.
    output = output.replace("\xa0", ",")
    output = white_re.sub(' ', output.strip())
    # NOTE(review): .decode on the rendered contents assumes Python 2,
    # where renderContents() returns a byte string -- confirm; on
    # Python 3 calling .decode on a text str would raise.
    output = output.decode('utf-8', 'ignore')
    output = misc.strip_html(output)
    return output
def write_df_to_otl(in_df, wdir, wfile, columns_l):
    """Write full contents of some columns of a data frame to an otl file.

    Open that file w/ Vim + VimOutliner for easy overview of all
    questions, and quick navigation.

    Use the list of columns specified in this function if caller
    does not specify such a list.
    """
    if in_df.empty:
        print('WARN: write*otl(): Input dataframe empty or not found.')
        return

    # Lift the column-width limit so to_string() does not truncate
    # cell contents; the configured limit is restored before returning.
    pd.set_option('display.max_colwidth', -1)  # -1 = no limit
    outfile = wdir + wfile
    save_prior_file(wdir, wfile)

    # Specify default output columns to use.
    if not columns_l:
        columns_l = [
            'Id', 'Title', 'Body', 'Score', 'HSTCount',
            'HiScoreTerms', 'OwnerUserId', 'ParentId'
        ]

    # Save o/p to a string and do not specify an output file in
    # calling to_string().
    # Use 'index=False' to prevent showing index in column 1.
    in_s = in_df[columns_l].to_string(header=False, index=False)

    # Replace blank spaces at end of each line w/ only the newline
    # char; one sub() handles all matching patterns in the string.
    out_s = re.sub(' +\n', '\n', in_s)
    # Convert html line breaks to newlines before stripping html.
    out_s = re.sub(r'<br>', '\n ', out_s)
    out_s = re.sub(r'<br/>', '\n ', out_s)
    # Clean the newlines in the string so each line has proper indent.
    out_s = ut.strip_html(out_s, "lxml")
    out_s = replace_line_breaks_for_otl(out_s)

    # Replace empty lines w/ INDENT+##
    out_s = re.sub(r'\n\s*\n', r'\n ##\n', out_s)

    with open(outfile, 'w') as f:
        cf.logger.info('NOTE: Writing data to otl outfile: ' + outfile)
        f.write(out_s)

    # Restore the configured column-width limit.
    pd.set_option('display.max_colwidth', MAX_COL_WID)
    return