Example #1
def data_generator(index_dir, window_size, include_stop_words=False):
    """Given a directory and a window size outputs a list of
    (sentence, number of men on screen, 
               number of women on screen,
               mean number of men on screen, 
               mean number of women on screen, 
               channel)

    sentence can be with or without stopwords
    """

    # Open the transcript files
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    channel = 'MSNBC'
    var = {'CNN':(1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)}
    SIZE = 20000

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    # Getting words
    words = get_lexicon()
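
The snippet above is cut off by the listing. As a rough sketch, not part of the original function, of how the loaded documents, lexicon, and index path are typically combined, assuming the CaptionIndex API used in the other examples on this page (intervals, tokens, iterable lexicon entries):

# Sketch only: build a token-id -> word mapping from the lexicon and read back
# the caption text of one (hypothetical) document through the index.
words_by_id = {w.id: w.token for w in lexicon}

with CaptionIndex(idx_path, lexicon, documents) as index:
    doc_id = 0  # hypothetical document id
    for posting in index.intervals(doc_id):
        token_ids = index.tokens(doc_id, posting.idx, posting.len)
        sentence = ' '.join(words_by_id[t] for t in token_ids)
        print('[{:.1f}s - {:.1f}s] {}'.format(posting.start, posting.end, sentence))
        break  # just the first interval in this sketch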
Example #2
def main(index_dir, query, silent, context_size):
    idx_path = os.path.join(index_dir, 'index.bin')
    doc_path = os.path.join(index_dir, 'documents.txt')
    data_path = os.path.join(index_dir, 'data')
    lex_path = os.path.join(index_dir, 'lexicon.txt')

    documents = Documents.load(doc_path)
    documents.configure(data_path)
    lexicon = Lexicon.load(lex_path)

    with CaptionIndex(idx_path, lexicon, documents) as index:
        if len(query) > 0:
            print('Query: ', query)
            run_search(' '.join(query), documents, lexicon, index,
                       context_size, silent)
        else:
            print('Enter a query:')
            while True:
                try:
                    query = input('> ')
                except (EOFError, KeyboardInterrupt):
                    print()
                    break
                query = query.strip()
                if len(query) > 0:
                    try:
                        run_search(query, documents, lexicon, index,
                                   context_size, silent)
                    except Exception:
                        traceback.print_exc()
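
run_search above is a helper defined elsewhere in the source script and is not shown on this page. As a purely hypothetical stand-in that uses only calls appearing in these examples (lexicon containment, iterating Documents, index.intervals, and index.tokens), a minimal stub might look like this (context_size is ignored here):

def run_search_stub(query, documents, lexicon, index, context_size, silent):
    # Make sure every query word is known to the lexicon
    for word in query.split():
        if word not in lexicon:
            print('Not in the lexicon:', word)
            return
    words_by_id = {w.id: w.token for w in lexicon}
    doc = next(iter(documents))  # Documents is iterable; take the first document
    for i, posting in enumerate(index.intervals(doc.id)):
        if i >= 3:  # only show a few intervals in this sketch
            break
        text = ' '.join(
            words_by_id[t] for t in index.tokens(doc.id, posting.idx, posting.len))
        if not silent:
            print('[{:.1f}s - {:.1f}s] {}'.format(posting.start, posting.end, text))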
Example #3
def main(out_dir: str,
         doc_dir: Optional[str],
         chunk_size: Optional[int] = None):
    assert chunk_size is None or chunk_size > 0

    # Load document names
    if doc_dir:
        docs_to_index = list(sorted(list_docs(doc_dir)))
    else:
        docs_to_index = read_docs_from_stdin()

    os.makedirs(out_dir, exist_ok=True)

    # Load or build a lexicon
    lex_path = os.path.join(out_dir, 'lexicon.txt')
    if not os.path.exists(lex_path):
        lexicon = build_lexicon(docs_to_index, lex_path)
        assert os.path.exists(lex_path), 'Missing: {}'.format(lex_path)
    else:
        lexicon = Lexicon.load(lex_path)

    # Build and store the document list
    docs_path = os.path.join(out_dir, 'documents.txt')
    documents = Documents([
        Documents.Document(id=i, name=d.name)
        for i, d in enumerate(docs_to_index)
    ])
    print('Storing document list: {}'.format(docs_path))
    documents.store(docs_path)
    assert os.path.exists(docs_path), 'Missing: {}'.format(docs_path)

    # Build inverted index chunks and reencode the documents
    index_path = os.path.join(out_dir, 'index.bin')
    data_dir = os.path.join(out_dir, 'data')
    remove_if_exists(index_path)
    remove_if_exists(data_dir)

    os.makedirs(data_dir)
    index_all_docs(docs_to_index, documents, lexicon, index_path, data_dir,
                   chunk_size)

    assert os.path.exists(index_path), 'Missing: {}'.format(index_path)
    print('Done!')
Example #4
def main(index_dir, workers, limit):
    doc_path = os.path.join(index_dir, 'documents.txt')
    documents = Documents.load(doc_path)

    if limit is None:
        limit = len(documents)

    start_time = time.time()
    with Pool(processes=workers,
              initializer=init_worker,
              initargs=(count_tokens, index_dir)) as pool:
        count = 0
        for n in tqdm(pool.imap_unordered(count_tokens, range(limit)),
                      desc='Counting tokens',
                      total=limit):
            count += n

    print('Scanned {} documents for {} tokens in {:d}ms'.format(
        limit, count, int(1000 * (time.time() - start_time))))
Example #5
def init_worker(function, index_dir):
    doc_path = os.path.join(index_dir, 'documents.txt')
    data_dir = os.path.join(index_dir, 'data')
    function.documents = Documents.load(doc_path)
    function.documents.configure(data_dir)
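
The Pool initializer above attaches the loaded Documents to the worker function object, so each child process loads the document list once. The actual count_tokens worker is not included in this excerpt; the hypothetical worker below only shows how that attached state is reached from inside a child process.

def example_worker(doc_id):
    # `example_worker.documents` is attached by init_worker through the Pool
    # initializer; the real count_tokens would read the tokens of doc_id here.
    documents = example_worker.documents
    return 1 if doc_id < len(documents) else 0

# Hypothetical usage, mirroring the Pool setup in the example above:
#   Pool(processes=workers, initializer=init_worker,
#        initargs=(example_worker, index_dir))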
Example #6
import os
import sys
from pathlib import Path

from captions import Documents, Lexicon, CaptionIndex, MetadataIndex


INDEX_DIR = '/app/data/index'
DOCUMENTS_PATH = os.path.join(INDEX_DIR, 'docs.list')
LEXICON_PATH = os.path.join(INDEX_DIR, 'words.lex')
INDEX_PATH = os.path.join(INDEX_DIR, 'index.bin')
# METADATA_PATH = os.path.join(INDEX_DIR, 'meta.bin')

print('Loading the document list and lexicon', file=sys.stderr)
# Reuse the existing globals if they are already defined (e.g., when this
# module is re-executed in an interactive session); otherwise load them once.
try:
    DOCUMENTS
    LEXICON
    INDEX
except NameError:
    DOCUMENTS = Documents.load(DOCUMENTS_PATH)
    LEXICON = Lexicon.load(LEXICON_PATH)
    INDEX = CaptionIndex(INDEX_PATH, LEXICON, DOCUMENTS)


def is_word_in_lexicon(word):
    return word in LEXICON

    
def _get_video_name(p):
    """Only the filename without exts"""
    return Path(p).name.split('.')[0]


def _init_doc_id_to_vid_id():
    video_ids = [v.id for v in Video.objects.all()]
Example #7
def main(
        index_dir: str,
        new_doc_dir: Optional[str],
        chunk_size: Optional[int] = None,
        skip_existing_names: bool = False
):
    assert chunk_size is None or chunk_size > 0
    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)

    documents = Documents.load(doc_path)

    if new_doc_dir:
        new_docs_to_index = list_docs(new_doc_dir)
    else:
        new_docs_to_index = read_docs_from_stdin()

    assert len(new_docs_to_index) > 0
    tmp_new_docs_to_index = []
    for new_doc in new_docs_to_index:
        if new_doc.name in documents:
            if skip_existing_names:
                print('Skipping: {} is already indexed!'.format(new_doc.name))
            else:
                raise Exception(
                    '{} is already indexed! Aborting.'.format(new_doc.name))
        else:
            tmp_new_docs_to_index.append(new_doc)
    new_docs_to_index = tmp_new_docs_to_index
    if len(new_docs_to_index) == 0:
        print('No new documents to index.')
        return

    # Update lexicon
    new_word_counts = get_word_counts(new_docs_to_index)
    lexicon_words = [
        Lexicon.Word(w.id, w.token, w.count + new_word_counts[w.token]
                     if w.token in new_word_counts else w.count)
        for w in old_lexicon
    ]
    for w in new_word_counts:
        if w not in old_lexicon:
            lexicon_words.append(
                Lexicon.Word(len(lexicon_words), w, new_word_counts[w]))
    lexicon = Lexicon(lexicon_words)

    base_doc_id = len(documents)
    new_documents = [Documents.Document(id=i + base_doc_id, name=d.name)
                     for i, d in enumerate(new_docs_to_index)]

    # Convert the existing index.bin into a directory if needed
    if os.path.isfile(index_path):
        tmp_index_path = index_path + '.tmp'
        shutil.move(index_path, tmp_index_path)
        os.makedirs(index_path)
        shutil.move(
            tmp_index_path,
            os.path.join(index_path, '{:07d}-{:07d}.bin'.format(
                0, base_doc_id)))
    assert os.path.isdir(index_path)

    # Index the new documents
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   os.path.join(index_dir, 'data'), chunk_size)

    # Write out the new documents file
    shutil.move(doc_path, doc_path + '.old')
    all_documents = list(documents)
    all_documents.extend(new_documents)
    Documents(all_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)

    print('Done!')
Example #8
def main(index_dir, silent, context_size, folder, use_gender):
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    words = get_lexicon()
    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think", "thing",
            "don’t", "like", "got", "people", "going", "talk", "right",
            "happened", ">>"
        ])
    print("Stop words", stop_words)

    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0

    # Create folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Create lemmatizer (kept in a variable named `stemmer`, as in the original)
    stemmer = WordNetLemmatizer()
    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))

            starttime = None

            for p in postings:
                if starttime is None:
                    starttime = p.start

                # Cut after 30s
                if p.end - starttime > 30 * count:
                    pickle.dump(
                        dic,
                        open(
                            os.path.join(
                                folder,
                                'Doc_%d_Chunk_%d.p' % (doc_id, count - 1)),
                            'wb'))
                    dic = {}
                    count += 1
                    starttime = p.end

                # Get words in posting
                tokens = index.tokens(int(doc_id), p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    # stemmed_word = stemmer.stem(word)
                    if word not in stop_words and len(word) > 1:
                        stemmed_word = stemmer.lemmatize(word)
                        # print("Word {} -> {}".format(word, stemmed_word))
                        if stemmed_word not in word_idx_dic.keys():
                            word_idx_dic[stemmed_word] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[stemmed_word]
                        if idx_token in dic:
                            dic[idx_token] += 1
                        else:
                            dic[idx_token] = 1
    pickle.dump(word_idx_dic, open(os.path.join(folder, "word_idx.p"), "wb"))
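
The loop above writes one bag-of-words dictionary per 30-second chunk plus a global word-to-index mapping. A small sketch for loading them back (the folder name and the Doc/Chunk ids below are hypothetical):

import os
import pickle

folder = 'chunks'  # hypothetical output folder passed to main()
with open(os.path.join(folder, 'word_idx.p'), 'rb') as f:
    word_idx_dic = pickle.load(f)
idx_to_word = {i: w for w, i in word_idx_dic.items()}

# Inspect the ten most frequent lemmas of one (hypothetical) chunk
with open(os.path.join(folder, 'Doc_150_Chunk_0.p'), 'rb') as f:
    counts = pickle.load(f)
for idx, n in sorted(counts.items(), key=lambda kv: -kv[1])[:10]:
    print(idx_to_word[idx], n)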