Example #1
    def build_and_gather_multiple_arrays(self, save_path):
        print("🌋 Extracting mentions features")
        parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

        print("🌋 Building and gathering arrays")
        arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]

        # build the feature dicts serially; removing the parallel process should reduce memory use
        arrays_dicts = list()
        for arr_doc in arr:
            arrays_dicts.append(get_feats(arr_doc['doc'], arr_doc['i']))
        del arr
        #arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
        gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
        n_mentions_list = []
        pairs_ant_index = 0
        pairs_start_index = 0
        for n, p, arrays_dict in tqdm(arrays_dicts):
            for f in FEATURES_NAMES:
                if gathering_dict[f] is None:
                    gathering_dict[f] = arrays_dict[f]
                else:
                    if f == FEATURES_NAMES[6]:
                        # offset antecedent indices by the running total from previous documents
                        array = [a + pairs_ant_index for a in arrays_dict[f]]
                    elif f == FEATURES_NAMES[3]:
                        # offset pair start indices by the running total from previous documents
                        array = [a + pairs_start_index for a in arrays_dict[f]]
                    else:
                        array = arrays_dict[f]
                    gathering_dict[f] += array
            pairs_ant_index += n
            pairs_start_index += p
            n_mentions_list.append(n)

        for feature in FEATURES_NAMES[:9]:
            print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
            if feature != "mentions_spans":
                array = np.array(gathering_dict[feature])
                if array.ndim == 1:
                    array = np.expand_dims(array, axis=1)
            else:
                array = np.stack(gathering_dict[feature])
            # check_numpy_array(feature, array, n_mentions_list)
            print("Saving numpy", feature, "size", array.shape)
            np.save(save_path + feature, array)
        for feature in FEATURES_NAMES[9:]:
            print("Saving pickle", feature, "size", len(gathering_dict[feature]))
            with open(save_path + feature + '.bin', "wb") as fp:  
                pickle.dump(gathering_dict[feature], fp)
        del arrays_dicts, gathering_dict
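All of these snippets lean on a parallel_process(array, function, n_jobs=..., use_kwargs=..., front_num=...) helper that maps a function over a list in a process pool and shows a tqdm progress bar. Its implementation is not included on this page; the sketch below is only a plausible stand-in inferred from how the examples call it, not the actual utility from these projects (note that a couple of the examples call a utils.parallel_process variant that takes the function as its first argument).

from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm


def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=3):
    # Run the first few items serially so exceptions surface with a readable traceback.
    front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
    if n_jobs == 1:
        return front + [function(**a) if use_kwargs else function(a)
                        for a in tqdm(array[front_num:])]
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in array[front_num:]]
        else:
            futures = [pool.submit(function, a) for a in array[front_num:]]
        # Tick the progress bar as jobs finish, then collect results in input order
        # (several of the examples index into the results positionally).
        for _ in tqdm(as_completed(futures), total=len(futures)):
            pass
        return front + [f.result() for f in futures]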
Example #2
def download_csv(files, column, num_thread):
    # download train validation
    for file in files:
        content = pd.read_csv(os.path.join(CSV_ROOT_PATH, file))
        print('entry count:', len(content[column].to_list()))
        print(content.shape[0])
        vids = list(set(content[column].to_list()))

        print('video count:', len(vids))

        parallel_process(vids, download_by_youtube_id, n_jobs=num_thread)

    # check missing
    find_missing_csv(column=column, files=files)
Example #3
def load_data(directory, label, target_directory):
    if not os.path.exists(directory):
        return []
    earthquake_files = [{
        'earthquake_file': x, 
        'directory': directory, 
        'target_directory': target_directory, 
        'label': label
        } for x in os.listdir(directory) if '.SAC' in x
    ]

    if len(earthquake_files) == 0:
        return []

    return parallel_process(earthquake_files, process_file, use_kwargs=True, n_jobs=num_cpu)
Example #4
def read_all_images(root, num_workers=4):
    classes, class_to_idx = find_classes(root)
    dataset = make_dataset(root, class_to_idx)
    if len(dataset) == 0:
        raise (RuntimeError("Found 0 images in subfolders of: " + root + "\n" +
                            "Supported image extensions are: " +
                            ",".join(IMG_EXTENSIONS)))

    num_images = len(dataset)
    paths = [dataset[i_image][0] for i_image in range(num_images)]

    print("Reading {0} images with {1} workers".format(num_images,
                                                       num_workers))
    if num_workers > 1:
        images = parallel_process(paths,
                                  read_image_for_pytorch,
                                  n_jobs=num_workers)
    else:
        images = []
        for p in tqdm(paths):
            images.append(read_image_for_pytorch(p))

    image_cache = {}
    for i, image in enumerate(images):
        path, target = dataset[i]
        image_cache[path] = image
    return image_cache
Example #5
def extract_mentions_spans(doc, blacklist=True, debug=False):
    '''
    Extract potential mentions from a spacy parsed Doc
    '''
    if debug: print('===== doc ====:', doc)
    for c in doc:
        if debug:
            print("🚧 span search:", c, "head:", c.head, "tag:", c.tag_,
                  "pos:", c.pos_, "dep:", c.dep_)
    # Named entities
    mentions_spans = list(ent for ent in doc.ents
                          if ent.label_ in ACCEPTED_ENTS)

    if debug:
        print("==-- ents:", list(
            ((ent, ent.label_) for ent in mentions_spans)))
    for spans in parallel_process([{
            'doc': doc,
            'span': sent,
            'blacklist': blacklist
    } for sent in doc.sents],
                                  _extract_from_sent,
                                  use_kwargs=True,
                                  front_num=0):
        mentions_spans = mentions_spans + spans
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start,
                                        spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))

    return cleaned_mentions_spans
Example #6
    def build_and_gather_multiple_arrays(self, save_path):
        print("🌋 Extracting mentions features")
        parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

        print("🌋 Building and gathering arrays")
        arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
        arrays_dicts = parallel_process(arr,
                                        get_feats,
                                        use_kwargs=True,
                                        n_jobs=self.n_jobs)
        gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
        n_mentions_list = []
        pairs_ant_index = 0
        pairs_start_index = 0
        for n, p, arrays_dict in tqdm(arrays_dicts):
            for f in FEATURES_NAMES:
                if gathering_dict[f] is None:
                    gathering_dict[f] = arrays_dict[f]
                else:
                    if f == FEATURES_NAMES[6]:
                        array = [a + pairs_ant_index for a in arrays_dict[f]]
                    elif f == FEATURES_NAMES[3]:
                        array = [a + pairs_start_index for a in arrays_dict[f]]
                    else:
                        array = arrays_dict[f]
                    gathering_dict[f] += array
            pairs_ant_index += n
            pairs_start_index += p
            n_mentions_list.append(n)

        for feature in FEATURES_NAMES[:9]:
            print("Building numpy array for", feature, "length",
                  len(gathering_dict[feature]))
            if feature != "mentions_spans":
                array = np.array(gathering_dict[feature])
                if array.ndim == 1:
                    array = np.expand_dims(array, axis=1)
            else:
                array = np.stack(gathering_dict[feature])
            # check_numpy_array(feature, array, n_mentions_list)
            print("Saving numpy", feature, "size", array.shape)
            np.save(save_path + feature, array)
        for feature in FEATURES_NAMES[9:]:
            print("Saving pickle", feature, "size",
                  len(gathering_dict[feature]))
            with open(save_path + feature + '.bin', "wb") as fp:
                pickle.dump(gathering_dict[feature], fp)
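For reference, the arrays written above could be loaded back roughly as follows. This is a hypothetical helper, not part of the original code: it assumes the same split between numpy-saved and pickled features as in the example, and that np.save appended the '.npy' suffix; pass the example's FEATURES_NAMES list as features_names.

import pickle

import numpy as np


def load_gathered_arrays(save_path, features_names, n_numpy=9):
    # Hypothetical inverse of the save loops above: the first n_numpy features were
    # written with np.save (which appends '.npy'), the remaining ones were pickled to '.bin'.
    data = {}
    for feature in features_names[:n_numpy]:
        data[feature] = np.load(save_path + feature + '.npy')
    for feature in features_names[n_numpy:]:
        with open(save_path + feature + '.bin', 'rb') as fp:
            data[feature] = pickle.load(fp)
    return data

# usage: data = load_gathered_arrays(save_path, FEATURES_NAMES)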
Example #7
def evaluate_flows_batch(srcnodes_list, destnodes_list, spm, n_threads=1):
    if n_threads == 1:
        flow_values_all = []
        for srcnodes, destnodes in tqdm(list(zip(srcnodes_list, destnodes_list))):
            flow_values_all.append(evaluate_flows_batch_p([srcnodes, destnodes, spm]))
    else:
        raise NotImplementedError("Parallel processing still doesn't work")
        # unreachable reference code for the intended parallel path:
        procdata = list(zip(list(srcnodes_list), list(destnodes_list), [spm] * len(list(srcnodes_list))))
        flow_values_all = utils.parallel_process(evaluate_flows_batch_p, procdata)
    return np.asarray(flow_values_all).T
Example #8
def read_all_images(root, num_workers=4):
    classes, class_to_idx = find_classes(root)
    dataset = make_dataset(root, class_to_idx)
    if len(dataset) == 0:
        raise (RuntimeError("Found 0 images in subfolders of: " + root + "\n" +
                            "Supported image extensions are: " + ",".join(IMG_EXTENSIONS)))

    num_images = len(dataset)
    paths = [dataset[i_image][0] for i_image in range(num_images)]

    print("Reading {0} images with {1} workers".format(num_images, num_workers))
    if num_workers > 1:
        images = parallel_process(paths, read_image_for_pytorch, n_jobs=num_workers)
    else:
        images = []
        for p in tqdm(paths):
            images.append(read_image_for_pytorch(p))

    image_cache = {}
    for i, image in enumerate(images):
        path, target = dataset[i]
        image_cache[path] = image
    return image_cache
Example #9
 def build_key_file(self, data_path, key_file, debug=False):
     print("🌋 Building key file from corpus")
     print("Saving in", key_file)
     # Create a pool of processes. By default, one is created for each CPU in your machine.
     with io.open(key_file, "w", encoding='utf-8') as kf:
         if debug: print("Key file saved in", key_file)
         for dirpath, _, filenames in os.walk(data_path):
             print("In", dirpath)
             file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \
                         or f.endswith(".v4_gold_conll")]
             cleaned_file_list = []
             for f in file_list:
                 fn = f.split('.')
                 if fn[1] == "v4_auto_conll":
                     gold = fn[0] + "." + "v4_gold_conll"
                     if gold not in file_list:
                         cleaned_file_list.append(f)
                 else:
                     cleaned_file_list.append(f)
             # self.load_file(file_list[0])
             doc_list = parallel_process(cleaned_file_list, read_file)
             for doc in doc_list:
                 kf.write(doc)
Example #10
import os
import time

from PIL import Image
from torchvision import transforms

ROOT = '/data0/EEV/data-frames/00DCWMfJIpc'

ttf = transforms.Compose([
    transforms.Resize(299),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


def a(i):
    return Image.open(os.path.join(ROOT, i)), i[:-4]


st = time.time()

files = os.listdir(ROOT)
# print(files)
res = parallel_process(files, a, n_jobs=20)
res = sorted(res, key=lambda t: t[1])

load_time = time.time() - st
st = time.time()
# print([x[1] for x in res])

res = [x[1] for x in res]

print(res)
print('1288', 'transform', time.time() - st, 'load', load_time)
# 4.6070 1377 imgs  7.2s total
Example #11
 def parallel_comment(self, c_list=[]):
     #  parallel_process(self.comment, self.comment_list)
     parallel_process(self.com_wrapper, c_list, cpu_cnt=2)
Example #12
	def build_and_gather_multiple_arrays(self, save_path, train_phase):
		print("🌋 Extracting mentions features")
		parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)
		#for doc in self.docs :
		#	set_feats(doc)

		print("🌋 Building and gathering arrays")
		arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
		#arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
		arrays_dicts = list()
		for arr_doc in arr:
			arrays_dicts.append(get_feats(arr_doc['doc'], arr_doc['i']))
		del arr
		gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
		n_mentions_list = []
		pairs_ant_index = 0
		pairs_start_index = 0
		for n, p, arrays_dict in tqdm(arrays_dicts):
			for f in FEATURES_NAMES:
				if gathering_dict[f] is None:
					gathering_dict[f] = arrays_dict[f]
				else:
					if f == FEATURES_NAMES[6]:
						array = [a + pairs_ant_index for a in arrays_dict[f]]
					elif f == FEATURES_NAMES[3]:
						array = [a + pairs_start_index for a in arrays_dict[f]]
					else:
						array = arrays_dict[f]
					gathering_dict[f] += array
			pairs_ant_index += n
			pairs_start_index += p
			n_mentions_list.append(n)

		for feature in FEATURES_NAMES[:10]:
			print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
			if feature != "mentions_spans":
				# check whether we are dealing with the story/memory features
				if feature == "mentions_stories" or feature == "pairs_stories":
					train_config = dict()
					max_story_len = 0
					if train_phase:
						max_story_len = max([len(story) for story in gathering_dict[feature]])
						max_story_len = min(200, max_story_len)  # cap the story length at 200

						print("max story len, (in train phase should be 200)", max_story_len)
						if os.path.exists('train_config.pickle'):
							with open('train_config.pickle', 'rb') as file_handle_init:
								train_config = pickle.load(file_handle_init)
						train_config[feature] = max_story_len
						with open('train_config.pickle', 'wb') as file_handle:
							pickle.dump(train_config, file_handle)
					else:
						with open('train_config.pickle', 'rb') as file_handle:
							train_config = pickle.load(file_handle)
						max_story_len = train_config[feature]
						print("max story len is (should be 200),", max_story_len)

					gathering_array = []
					for story in gathering_dict[feature]:
						if len(story) > 200:
							# truncate long stories to their last 200 entries
							final_story = story[-200:]
						else:
							# pad short stories with zeros up to max_story_len
							number_to_append = max(0, max_story_len - len(story))
							final_story = story + number_to_append * [0]
						gathering_array.append(final_story)
					array = np.array(gathering_array)
					print(array.shape)
				else:
					array = np.array(gathering_dict[feature])

				if array.ndim == 1:
					print("expand_dims for feature, ", feature)
					array = np.expand_dims(array, axis=1)
			else:
				array = np.stack(gathering_dict[feature])
			# check_numpy_array(feature, array, n_mentions_list)
			print("Saving numpy", feature, "size", array.shape)
			np.save(save_path + feature, array)
		for feature in FEATURES_NAMES[9:]:
			print("Saving pickle", feature, "size", len(gathering_dict[feature]))
			with open(save_path + feature + '.bin', "wb") as fp:  
				pickle.dump(gathering_dict[feature], fp)
		del arrays_dicts, gathering_dict
Example #13
base_directory = 'data/generated/musdb'
os.makedirs(base_directory, exist_ok=True)

num_training = 20000
num_validation = 2000
num_testing = 0
n_jobs = 12
splits = [('train', num_training),
          ('validation', num_validation),
          ('test', num_testing)]

directories = {
    'train': os.path.join('data/musdb', 'train'),
    'validation': os.path.join('data/musdb', 'validation'),
    'test': os.path.join('data/musdb', 'test'),
    'background': None
}

for split, num_split in splits:
    print("Generating %s" % split)
    target_directory = os.path.join(base_directory, split)
    os.makedirs(target_directory, exist_ok=True)
    mixes = [{"i": i,
              'scene_duration': 3.2,
              'max_sources': 4,
              'foreground_directory': directories[split],
              'background_directory': directories['background'],
              "target_directory": target_directory} for i in range(num_split)]

    parallel_process(mixes, create_mixture, n_jobs=n_jobs, use_kwargs=True)
Example #14
 def read_corpus(self, data_path, debug=False):
     print("🌋 Reading files")
     for dirpath, _, filenames in os.walk(data_path):
         print("In", dirpath, os.path.abspath(dirpath))
         file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \
                     or f.endswith(".v4_gold_conll")]
         cleaned_file_list = []
         for f in file_list:
             fn = f.split('.')
             if fn[1] == "v4_auto_conll":
                 gold = fn[0] + "." + "v4_gold_conll"
                 if gold not in file_list:
                     cleaned_file_list.append(f)
             else:
                 cleaned_file_list.append(f)
         doc_list = parallel_process(cleaned_file_list, load_file)
         for docs in doc_list:  #executor.map(self.load_file, cleaned_file_list):
             for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                 print("Imported", name)
                 if debug:
                     print("utts_text", utts_text)
                     print("utt_tokens", utt_tokens)
                     print("utts_corefs", utts_corefs)
                     print("utts_speakers", utts_speakers)
                     print("name, part", name, part)
                 self.utts_text += utts_text
                 self.utts_tokens += utt_tokens
                 self.utts_corefs += utts_corefs
                 self.utts_speakers += utts_speakers
                 self.utts_doc_idx += [len(self.docs_names)
                                       ] * len(utts_text)
                 self.docs_names.append((name, part))
     print("utts_text size", len(self.utts_text))
     print("utts_tokens size", len(self.utts_tokens))
     print("utts_corefs size", len(self.utts_corefs))
     print("utts_speakers size", len(self.utts_speakers))
     print("utts_doc_idx size", len(self.utts_doc_idx))
     print("🌋 Building docs")
     for name, part in self.docs_names:
         self.docs.append(
             ConllDoc(name=name,
                      part=part,
                      nlp=None,
                      blacklist=False,
                      consider_speakers=True,
                      embedding_extractor=self.embed_extractor,
                      conll=CONLL_GENRES[name[:2]]))
     print("🌋 Loading spacy model")
     try:
         spacy.info('en_core_web_sm')
         model = 'en_core_web_sm'
     except IOError:
         print("No spacy 2 model detected, using spacy1 'en' model")
         spacy.info('en')
         model = 'en'
     nlp = spacy.load(model)
     print("🌋 Parsing utterances and filling docs")
     doc_iter = (s for s in self.utts_text)
      ## For each utterance in the text, use spaCy to parse it and align it with the data extracted
      ## from the CoNLL file (spacy_tokens, conll_tokens, corefs, speaker, doc_id),
      ## then call add_conll_utterance to extract the mentions and initialize the Mention objects.
     for utt_tuple in tqdm(
             zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                 self.utts_speakers, self.utts_doc_idx)):
         spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
         # if debug: print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
         doc = spacy_tokens
         # if debug:
         #     out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
         #               " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
         #     print(out_str.encode('utf-8'))
         ####
         self.docs[doc_id].add_conll_utterance(
             doc,
             conll_tokens,
             corefs,
             speaker,
             use_gold_mentions=self.use_gold_mentions)
Example #15
                                        relevant=relevant,
                                        k=i)
        mAP[i] = {}
        mAP[i]['precision'] = precision
        mAP[i]['recall'] = recall
    return mAP


print("Beginning to score documents using {}...".format(args.jar))
query_set = list(claim_to_article.keys())

if args.small:
    query_set = query_set[:100]

# result = Parallel(n_jobs=15)(delayed(score_claim)(c) for c in query_set)
result = utils.parallel_process(query_set, score_claim, n_jobs=15)

# print("Saving results to disk...")
# with open("result.pkl", "wb") as f:
# pickle.dump(result, f)

mAP = utils.calculatemAP(result, k)
print(utils.query_customized_lucene("testing", k=5, jar_name=args.jar)[0])
print("Mean Average Precision:")
utils.displaymAP(mAP)
recalls = []
recalls.extend(mAP[1]['recall'])
recalls.extend(mAP[5]['recall'])
recalls.extend(mAP[10]['recall'])
recalls.extend(mAP[20]['recall'])
print("Avg recall: {}".format(np.mean(recalls)))
Example #16
nodes_src_list = list(map(gm.gid2id, srcmodules))

print("Evaluating flows...")


def wrap(pair):
    nodes_dest, nodes_src = pair[0], pair[1]
    return flower.eval_flow_centrality(nodes_dest,
                                       nodes_src,
                                       spm=spm_data,
                                       progressbar=False)


if args.n_cores > 1:
    flow_values_all = utils.parallel_process(
        wrap,
        list(zip(nodes_dest_list, nodes_src_list)),
    )
else:
    flow_values_all = list(map(
        wrap,
        zip(nodes_dest_list, nodes_src_list),
    ))
flow_values_all = np.asarray(flow_values_all)

flow_values = flow_values_all[:, :, 0].T

flow_values_mean = flow_values.mean(axis=1)
flow_values_std = flow_values.std(axis=1)

data = pd.DataFrame(list(zip(nodelist, flow_values_mean, flow_values_std)),
                    columns=['NodeID', 'Flow_mean', 'Flow_std'])
Example #17
        diff = set(find_missing(vids))
        vids = list(set(vids) - diff)

        filename = os.path.splitext(file)[0]
        with open('vidlist_%s.txt' % (filename), 'w') as file:
            file.write('\n'.join(vids))

if __name__ == '__main__':
    args = parser.parse_args()
    if args.input_list:
        if os.path.exists(args.input_list):
            with open(args.input_list, 'r') as file:
                vids = file.readlines()
                print(len(vids))
                parallel_process(vids, download_by_youtube_id, n_jobs=args.num_thread)
    elif args.find_missing is not None:
        print('Find missing videos:', ['Train/val', 'Test'][args.find_missing])
        find_missing_csv(column=['YouTube ID', 'Video ID'][args.find_missing], files=[['train.csv', 'val.csv'], ['test.csv']][args.find_missing])
    elif args.download_tests:
        print('Downloading test...')
        download_csv(files=['test.csv'], column='Video ID', num_thread=args.num_thread)
    elif args.gen_vidmap:
        print('Generate vid to index map')
        # gen_vidmap_csv(files=['train.csv', 'val.csv'])
        # vid start_idx length
        gen_length_vidmap_csv(column='Video ID', files=['test.csv'])
    elif args.gen_vidlist:
        print('Generate test vid list')
        gen_vid_list()
    else:
Example #18
import os
import subprocess

VIDEO_ROOT = '/data0/EEV/data'
FRAME_ROOT = '/data/EEV/data-audio'

# logging.basicConfig(filename='log/ea_{}.log'.format(int(time.time())), filemode='w', level=logging.DEBUG)


def extract(filename):
    """
    filename: an 11-character YouTube id plus '.mp4', e.g. 'IHRncab3Cdg.mp4'
    """
    assert len(filename) == 15
    video_id = filename[:-4]

    full_path = os.path.join(FRAME_ROOT, video_id)
    # if os.path.exists(full_path + '.wav'):
    #     return

    cmd = 'ffmpeg -i {}/{} -threads 1 -vn -acodec pcm_s16le -ac 1 -ar 16000 {}/{}.wav'.format(
        VIDEO_ROOT, filename, FRAME_ROOT, video_id)

    # extract audio
    # to silence ffmpeg, pass stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL to run()
    f = subprocess.run(cmd.split(' '))


if __name__ == '__main__':
    args = parser.parse_args()
    filenames = os.listdir(VIDEO_ROOT)
    # extract('IHRncab3Cdg.mp4')
    parallel_process(filenames, extract, n_jobs=args.num_thread)