Code Example #1
File: download.py Project: appastair/image-dl
# re and unique_everseen are the only imports this excerpt needs
# (unique_everseen is taken from more_itertools here; the project may
# inline the equivalent itertools recipe instead). get_elements,
# get_page_links, get_imagebam_htmlcode_links, set_name and
# download_file are helpers defined elsewhere in the project.
import re

from more_itertools import unique_everseen


def imagebam(url, name, dest, delim, digits, number):
    print("Downloading images from [imagebam]...\n")

    # gallery page numbers (ascending)
    page_count = [int(el.contents[0])
                  for el in get_elements(url, "a.pagination_link")]

    if page_count:
        # multi-page gallery
        links = get_imagebam_htmlcode_links(url, page_count[-1])
    else:
        # single-page gallery
        links = get_page_links(url, lambda x: "imagebam.com" in x)

    # remove any duplicate links, preserving their original order
    links = list(unique_everseen(links))

    # file extension at the end of the URL, e.g. ".jpg" or ".png"
    regex = re.compile(r'\.[a-z]+$', re.IGNORECASE)

    for link in links:
        try:
            # source image (i.e. "Open image in a new tab")
            src = [el['src']
                   for el in get_elements(link, 'img')
                   if 'id' in el.attrs]
            if src:
                # image URL
                image_url = src[0]

                # filetype (default to ".jpg" when the URL has no extension)
                ext = regex.search(image_url)
                ext = ".jpg" if ext is None else ext.group(0)

                # output filename
                new_name = set_name(name, ext, delim, number, digits)

                # download
                download_file(image_url, new_name, dest, number)
                number += 1
        except Exception:
            # skip links whose page markup doesn't match the expected layout
            pass
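
The example above deduplicates the gallery links with unique_everseen, which yields each element the first time it appears and preserves input order, unlike a set() round-trip. A minimal sketch of that behavior (the URLs are made up for illustration):

from more_itertools import unique_everseen

links = [
    "http://www.imagebam.com/image/aaa",
    "http://www.imagebam.com/image/bbb",
    "http://www.imagebam.com/image/aaa",  # duplicate
]

# each link appears once, in first-seen order
print(list(unique_everseen(links)))
# ['http://www.imagebam.com/image/aaa', 'http://www.imagebam.com/image/bbb']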
Code Example #2
    def words(self):
        """Harvest all words enclosed in <p> tags in the webpage source.

        Yields
        ------
        str
            A single word that is not in the list of excluded words.
        """
        if self.soup is None:
            return

        paragraphs = self.soup.find_all('p')
        if not paragraphs:
            return

        # join the text of every paragraph into one string
        paragraph_text = ' '.join(each.text for each in paragraphs)

        # normalise tokens before the stop-word check so that e.g. "The"
        # is excluded just like "the"
        words = [word.lower().strip()
                 for word in re.split(self.split_string, paragraph_text)]
        words = [word for word in words if word and word not in self.stop_words]

        # yield each distinct word once, in order of first appearance
        for each_word in unique_everseen(words):
            yield each_word
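
The method above assumes a class that holds a BeautifulSoup parse tree in self.soup together with split_string (a token-delimiter regex) and stop_words attributes. A self-contained sketch of the same idea follows; the sample HTML, delimiter pattern, and stop-word list are illustrative stand-ins, not values from the original project:

import re

from bs4 import BeautifulSoup
from more_itertools import unique_everseen

html = "<p>The cat sat.</p><p>The CAT sat again!</p>"  # toy input
soup = BeautifulSoup(html, "html.parser")

split_string = r"[^0-9a-zA-Z'-]+"  # assumed delimiter pattern
stop_words = {"the", "a", "an"}    # assumed exclusion list

# join paragraph text, tokenise, normalise, then deduplicate in order
text = " ".join(p.text for p in soup.find_all("p"))
tokens = (t.lower().strip() for t in re.split(split_string, text))
kept = (t for t in tokens if t and t not in stop_words)

print(list(unique_everseen(kept)))
# ['cat', 'sat', 'again']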
Code Example #3
File: dream_experiment.py Project: dot-Sean/dreams
# labelfile is set by the enclosing experiment loop in dream_experiment.py;
# read_dreams, read_labels, match_labels_documents, flatten, IRSystem and
# mean_average_precision are helpers defined in the project.
# unique_everseen is taken from more_itertools here.
from collections import Counter, defaultdict

import numpy as np
from more_itertools import unique_everseen
from sklearn.model_selection import KFold

# First we'll do a regular IR experiment with BM25
documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
labels = list(read_labels("data/" + labelfile))
y, X = zip(*match_labels_documents(documents, labels))
y, X = np.array(y), np.array(X)
kf = KFold(n_splits=10, shuffle=True, random_state=1)
rank_scores = np.zeros(10)
for i, (train, test) in enumerate(kf.split(X)):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    labels = Counter(flatten(list(y_train)))
    labels = [label for label, count in labels.items() if count >= 1]
    model = IRSystem(k1=1.2, b=0.75, cutoff=0)
    model.fit_raw(X_train, y_train, ngram_range=(1, 1), stop_words='english', min_df=2)
    ranking = model.rank_labels(X_test, raw=True)
    ranking = ranking.tolist()
    # flatten each ranking and drop repeated labels, keeping first-seen order
    ranking = [list(unique_everseen(flatten(r))) for r in ranking]
    # keep only test documents with at least one label seen during training
    ranking, y_test = zip(*[(r, y_) for r, y_ in zip(ranking, y_test) if any(l in labels for l in y_)])
    rank_scores[i] = mean_average_precision(ranking, y_test)
print('IR: (%s)' % labelfile, rank_scores.mean(), rank_scores.std())

# Next, we'll do an IR experiment with Big Documents
documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
labels = list(read_labels("data/" + labelfile))
y, X = zip(*match_labels_documents(documents, labels))
y, X = np.array(y), np.array(X)
kf = KFold(n_splits=10, shuffle=True, random_state=1)
rank_scores = np.zeros(10)
for i, (train, test) in enumerate(kf.split(X)):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    big_docs = defaultdict(str)
    for (labels, doc) in zip(y_train, X_train):
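
The snippet ends mid-loop in the original source, so the Big Documents aggregation itself is cut off. The helper mean_average_precision is project code, but the metric is standard: for each test document, take the precision at every rank where a gold label occurs, average those precisions, then average over documents. A minimal sketch under one common definition (the function and variable names here are mine, not the project's):

def average_precision(ranking, relevant):
    """Average precision of one ranked label list against a set of gold labels."""
    hits, score = 0, 0.0
    for rank, label in enumerate(ranking, start=1):
        if label in relevant:
            hits += 1
            score += hits / rank  # precision at this cutoff
    return score / len(relevant) if relevant else 0.0

def mean_average_precision(rankings, gold):
    """Mean of the per-document average precisions."""
    return sum(average_precision(r, set(g)) for r, g in zip(rankings, gold)) / len(rankings)

# toy check: the single relevant label sits at rank 2, so AP = MAP = 0.5
print(mean_average_precision([["a", "b", "c"]], [["b"]]))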