import re

from more_itertools import unique_everseen  # assumed source of unique_everseen

# get_elements, get_imagebam_htmlcode_links, get_page_links, set_name and
# download_file are helpers assumed to be defined elsewhere in this module.


def imagebam(url, name, dest, delim, digits, number):
    print "Downloading images from [imagebam]...\n"

    # gallery page numbers (ascending)
    page_count = [int(el.contents[0])
                  for el in get_elements(url, "a.pagination_link")]

    if page_count:
        # multi-page gallery
        links = get_imagebam_htmlcode_links(url, page_count[-1])
    else:
        # single-page gallery
        links = get_page_links(url, lambda x: "imagebam.com" in x)

    # remove any duplicate links
    links = list(unique_everseen(links))

    # file extension at the end of an image URL, e.g. ".jpg"
    regex = re.compile(r'\.[a-zA-Z]*$')

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            src = [el['src'] for el in get_elements(link, 'img')
                   if 'id' in el.attrs]
            if len(src) > 0:
                # image URL
                image_url = src[0]

                # filetype (default to ".jpg" when the URL has no extension)
                ext = regex.search(image_url)
                if ext is None:
                    ext = ".jpg"
                else:
                    ext = ext.group(0)

                # output filename
                new_name = set_name(name, ext, delim, number, digits)

                # download
                download_file(image_url, new_name, dest, number)
                number += 1
        except Exception:
            # skip links whose image can't be resolved or downloaded
            pass
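
# A minimal, hypothetical invocation of the downloader above. The gallery URL
# and every argument value here are made-up placeholders; with these settings
# files would be saved under names like "cats_001.jpg" (the exact format
# depends on set_name).
if __name__ == '__main__':
    imagebam(
        url="http://www.imagebam.com/gallery/abc123",  # hypothetical gallery
        name="cats",       # base output filename
        dest="downloads",  # destination directory
        delim="_",         # separator between name and counter
        digits=3,          # zero-pad the counter to three digits
        number=1,          # starting counter value
    )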
# assumes `re` and `unique_everseen` are imported at module level
def words(self):
    """Harvest all words enclosed in <p> tags in webpage source

    Yields
    ------
    str
        Single word which is not in list of excluded words
    """
    if self.soup is None:
        return

    paragraphs = self.soup.find_all('p')
    # header_text = [self.soup.find_all("h{}".format(i)) for i in range(1, 7)]
    if not paragraphs:
        return

    all_words = [each.text for each in paragraphs]
    paragraph_text = ' '.join(all_words)
    # text_all_page = self.soup.get_text()

    # normalise before the stop-word check so capitalised stop words
    # ("The", "A", ...) are excluded as well
    words = [word for word in
             (w.lower().strip() for w in re.split(self.split_string, paragraph_text))
             if word not in self.stop_words]

    for each_word in unique_everseen(words):  # unique words
        yield each_word
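
# A minimal usage sketch, assuming `words` is a method of a scraper class
# that exposes `soup` (a BeautifulSoup tree), `split_string` (a delimiter
# regex) and `stop_words` (a set of excluded words). The class name
# `PageWords` and all attribute values below are illustrative assumptions.
import re
from bs4 import BeautifulSoup
from more_itertools import unique_everseen  # assumed source of unique_everseen

class PageWords(object):
    def __init__(self, html):
        self.soup = BeautifulSoup(html, 'html.parser')
        self.split_string = r'\W+'                # split on non-word runs
        self.stop_words = {'the', 'a', 'an', ''}  # toy excluded-word list

PageWords.words = words  # attach the generator defined above as a method

page = PageWords("<p>The quick fox jumps. A lazy dog sleeps.</p>")
print list(page.words())  # ['quick', 'fox', 'jumps', 'lazy', 'dog', 'sleeps']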
import numpy as np

from collections import Counter, defaultdict
from sklearn.cross_validation import KFold   # assumed: pre-0.18 scikit-learn API, matching this call signature
from more_itertools import unique_everseen   # assumed source of unique_everseen

# read_dreams, read_labels, match_labels_documents, flatten, IRSystem and
# mean_average_precision are project helpers assumed to be defined elsewhere;
# `labelfile` is assumed to be set earlier in the script.

# First we'll do a regular IR experiment with BM25
documents = {doc_id: text
             for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
labels = list(read_labels("data/" + labelfile))
y, X = zip(*match_labels_documents(documents, labels))
y, X = np.array(y), np.array(X)

kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1)
rank_scores = np.zeros(10)
for i, (train, test) in enumerate(kf):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    # labels seen in the training fold (the >= 1 threshold is presumably a
    # tunable minimum frequency)
    labels = Counter(flatten(list(y_train)))
    labels = [label for label, count in labels.items() if count >= 1]
    # BM25 hyperparameters: k1 controls term-frequency saturation,
    # b controls document-length normalisation
    model = IRSystem(k1=1.2, b=0.75, cutoff=0)
    model.fit_raw(X_train, y_train, ngram_range=(1, 1),
                  stop_words='english', min_df=2)
    ranking = model.rank_labels(X_test, raw=True)
    ranking = ranking.tolist()
    ranking = map(lambda r: list(unique_everseen(r)), map(flatten, ranking))
    # evaluate only test items that carry at least one label seen in training
    ranking, y_test = zip(*[(r, y_) for r, y_ in zip(ranking, y_test)
                            if any(l in labels for l in y_)])
    rank_scores[i] = mean_average_precision(ranking, y_test)
print 'IR: (%s)' % (labelfile), rank_scores.mean(), rank_scores.std()

# Next, we'll do an IR experiment with Big Documents
documents = {doc_id: text
             for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
labels = list(read_labels("data/" + labelfile))
y, X = zip(*match_labels_documents(documents, labels))
y, X = np.array(y), np.array(X)

kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1)
rank_scores = np.zeros(10)
for i, (train, test) in enumerate(kf):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    # concatenate all training documents that share a label into a single
    # "big document" per label
    big_docs = defaultdict(str)
    for (labels, doc) in zip(y_train, X_train):
        # assumed completion of the truncated loop: append each document to
        # the big document of every label it carries
        for label in labels:
            big_docs[label] += ' ' + doc
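
# `mean_average_precision` is not shown in this snippet; below is a minimal
# reference sketch of the metric under its usual definition (average precision
# of each ranked label list against the item's gold labels, then the mean over
# all test items). An illustrative assumption, not necessarily the project's
# exact implementation.
def mean_average_precision_sketch(rankings, gold):
    ap_scores = []
    for ranked, relevant in zip(rankings, gold):
        relevant = set(relevant)
        hits, precision_sum = 0, 0.0
        for rank, label in enumerate(ranked, start=1):
            if label in relevant:
                hits += 1
                precision_sum += hits / float(rank)  # precision at this rank
        ap_scores.append(precision_sum / max(len(relevant), 1))
    return sum(ap_scores) / float(len(ap_scores))

# e.g. mean_average_precision_sketch([['a', 'b', 'c']], [['b']]) == 0.5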